阅读代码点时发生偏移

简历 :我目前正在写一个ActionScript 3词法分析器,将源代码转换为令牌。 我select用代码点解释input,一个包含在UString类中的可选代理对的string。 在引擎盖下,我使用UStringPos类来caching最后一个读取位置。

我已经testing了它如何扫描标识符"huehuehue"

 'use strict'; import {Lexer} from 'core/Lexer'; import {UString} from 'utils/UString'; import ErrorHandler from 'core/ErrorHandler'; const errorHandler = new ErrorHandler(true); // Tell the length to the `Lexer` manually. const lexer = new Lexer( new UString('huehuehue'), 9, errorHandler); // Scan first token lexer.next(); const id = lexer.lookahead.value; console.log( id, id.length ); 

它应该logging"huehuehue", 9 ,但是另一个故事…

为什么它错过了最后的'e' ? 与扫描相关的最里面的方法是Lexer#getCommonIdentifierUString说一下,我已经testing了我的UString部分,并且它工作正常。

Lexer相关定义

 /* * Class that turns AS3 code into tokens. */ export class Lexer { /* * @param {UString} source * @param {Number} length * @param {ErrorHandler} errorHandler */ constructor(source, length, errorHandler) { this.source = source; this.length = length; this.index = 0; this.lineStart = 0; this.lineNumber = 1; this.comments = []; this.errorHandler = errorHandler; this.previousToken = null; this.token = null; this.lookahead = null; this._special = []; } /* * Verifies the end of file. */ eof() { return this.index >= this.length; } /* * Advance the previous, current and lookahead tokens. * The lexer however does not depend on these tokens. */ next() { this.previousToken = this.token; this.token = this.lookahead; this.lookahead = this.lex(); } /* * Consumes the next token and return it. */ lex() { this.consumeWhiteSpaces(); while (this.consumeComment()) this.consumeWhiteSpaces(); let cp = this.source.codePointAt(this.index); let pureIdentifier = Character.isIdentifierStart(cp); if (pureIdentifier || (cp === 0x5C)) return this.scanIdentifierOrKeyword(!pureIdentifier); if (this.eof()) { let loc = [ this.index, this.lineNumber ]; return new Token(TokenType.EOF, loc, loc, '<end>'); } } /* * Scan an identifier, keyword or boolean literal. */ scanIdentifierOrKeyword(usingEscape) { const start = this.index; let id; /* Like Esprima does: only identifiers containing * escapes need some overheads. */ if (usingEscape) { id = this.getEscapedIdentifier( String.fromCodePoint(this.scanUnicodeEscapeSequence())); } else id = this.getCommonIdentifier(); return new Token( TokenType.IDENTIFIER, [ start , this.lineNumber ], [ this.index, this.lineNumber ], id ); } /* * Interprets an identifier. If any escape appears, switches to * getEscapedIdentifier(). */ getCommonIdentifier() { const start = this.source.position.offset; let cp = 0; // Jump the starting symbol. ++this.index; while (!this.eof()) { cp = this.source.codePointAt(this.index); if (Character.isIdentifierPart(cp)) ++this.index; // Switches to escape-minded task... else if (cp === 0x5C) return this.getUnicodeEscapedIdentifier( this.source.string.slice( start, this.source.position.offset ) ); else break; } return this.source.string.slice( start, this.source.position.offset ); } /* ... */ } 

utils的/ UString.js

 'use strict'; /* * String wrapper with methods _based_ on code points. */ export class UString { /* * Constructs the {UString}. * * @param {String} s String to be wrapped. */ constructor(s) { /* * @type {String} */ this.string = s; /* * Tracks the last accessed position. * * @type {UStringPos} */ this.position = new UStringPos(0, 0); } /* * Reads a code point at specific index. * * @param {Number} index * @return {Number} */ codePointAt(index) { this.position.walk(this.string, index); return this.string.codePointAt(this.position.offset); } /* * Slices the internal string by code point indices. * * @param {Number} i * @param {Number} j * @return {String} */ slice(i, j) { this.position.walk(this.string, i); i = this.position.offset; this.position.walk(this.string, j); j = this.position.offset; return this.string.slice(i, j); } }; /* * Class that tracks the position of a code point on a string. */ export class UStringPos { /* * Constructs the {UStringPos}. * * @param {Number} index The initial index. * @param {Number} offset The initial offset. */ constructor(index, offset) { /* * @type {Number} */ this.index = index; /* * @type {Number} */ this.offset = offset; } /* * Walks to the given index. * * @param {String} s * @param {Number} index * @note No backward. Track the previous position instead. * @return {void} */ walk(s, index) { for (; this.index < index; ++this.index) this.offset += ( this._usingSurrogates( s.charCodeAt(this.offset) ) ? 2 : 1 ); } /* * @private */ _usingSurrogates(ch) { return (ch >= 0xD800) && (ch <= 0xDBFF); } }; 

什么?

好的。 所以这是一个this.source.position.offset的问题:当我做++this.index ,我UStringPos的偏移量不会更新。 问题是切片的事情。

  this.source.string.slice( start, this.source.position.offset ); 

这个片段是基于偏移量的,因为我必须跟踪标识符开始的前一个偏移量。

我可以使用我自己的UString类的切片,并使用第一个参数作为偏移量,最后一个作为正常的索引。

 'use strict'; export class UString { // ... /* * Slices the internal string by using a pair of * offset and code point indices. * * @param {Number} i Offset * @param {Number} j * @return {String} */ slice(i, j) { this.position.walk(this.string, j); j = this.position.offset; return this.string.slice(i, j); } };