Source: parser/lexer.js

/** A single lexical token
 * @constructor
 * @param {string}  type   The type of token
 * @param           value  The value of the token
 * @param {number}  begin  The start index of the input region represented by the token
 * @param {number}  end    The index of the character one past the end of the input region represented by the token
 * @param {string}  text   The text of the token
 */
Parser.Token = function(type, value, begin, end, text) {
    // Copy all constructor arguments onto the instance, in declaration order.
    Object.assign(this, { type, value, begin, end, text });
};

/** Render the token for diagnostics
 * @returns {string} The token type followed by the token text in single quotes
 */
Parser.Token.prototype.toString = function() {
    return `${this.type} '${this.text}'`;
};

/** Dictionary of token types
 * Each key maps to a string that is identical to the key itself.
 */
Parser.TokenType = [
    // Operands
    'Identifier',
    'Register',
    'Number',

    // Operators
    'Multiplication',
    'Division',
    'Remainder',
    'Addition',
    'Subtraction',
    'LogicalShiftLeft',
    'LogicalShiftRight',
    'ArithmeticShiftRight',
    'LessThan',
    'LessEqual',
    'GreaterThan',
    'GreaterEqual',
    'Assignment',
    'Equals',
    'NotEquals',
    'BitwiseAND',
    'BitwiseXOR',
    'BitwiseOR',
    'LogicalAND',
    'LogicalOR',
    'LogicalNOT',
    'BitwiseNOT',

    // Punctuation and end-of-input marker
    'LParen',
    'RParen',
    'Comma',
    'Colon',
    'QuestionMark',
    'EndOfString',
].reduce(function(types, name) {
    types[name] = name;
    return types;
}, {});

/** General lexer
 * Transforms an input string into a stream of tokens.
 *
 * Whitespace and comments (a <code>#</code> up to the end of the line) between
 * tokens are skipped. Tokens are obtained one at a time through {@link Parser.Lexer#next}.
 * @constructor
 * @param {string} input    The string to transform
 */
Parser.Lexer = function(input) {
    /**
     * Index of the next character to be processed
     * @member Parser.Lexer
     * @private
     * @type {Number}
     */
    let index = 0;
    /**
     * Start index of the current token
     * @member Parser.Lexer
     * @private
     * @type {Number}
     */
    let marker = 0;

    /** Determine whether we have reached the end of the input string
     * @member Parser.Lexer
     * @private
     * @returns {boolean} <code>true</code> if and only if there are no more characters left
     */
    function endOfString() {
        return index >= input.length;
    }

    /** Start the next token at the current input position
     * @member Parser.Lexer
     * @private
     */
    function startToken() {
        marker = index;
    }

    /** Determine the next character without advancing the input position
     * @member Parser.Lexer
     * @private
     * @returns {(string|undefined)} The next character, or undefined if there are no more characters left.
     */
    function peekNextChar() {
        return endOfString() ? undefined : input.charAt(index);
    }

    /** Skip the next character, if any
     * @member Parser.Lexer
     * @private
     */
    function skipChar() {
        if (!endOfString()) {
            index++;
        }
    }

    /** Skip the next character if it equals the expected one
     * @member Parser.Lexer
     * @private
     * @param {string} expected The character to look for
     * @returns {boolean} <code>true</code> if and only if the character was present and has been skipped
     */
    function acceptChar(expected) {
        if (peekNextChar() !== expected) {
            return false;
        }
        skipChar();
        return true;
    }

    /** Throw a {@link Parser.LexerError}, specifying the current input token
     *
     * The reported text and range span from the start of the current token
     * up to (but excluding) the current input position.
     * @member Parser.Lexer
     * @private
     * @param {string} message A human-readable message explaining the kind of exception
     * @throws {Parser.LexerError}
     */
    function error(message) {
        throw new Parser.LexerError(message, input.substring(marker, index), marker, index);
    }

    /** Skip a comment until the end of the line
     *
     * Expects the input position to be at the character introducing the comment.
     * The terminating newline itself is not consumed.
     * @member Parser.Lexer
     * @private
     */
    function skipComment() {
        skipChar(); // the comment introducer
        while (!endOfString() && peekNextChar() !== '\n') {
            skipChar();
        }
    }

    /** Determine whether the given character is whitespace
     * @member Parser.Lexer
     * @private
     * @param {string} ch   The character to check
     * @return {boolean} <code>true</code> if and only if the character is whitespace
     */
    function isWhitespace(ch) {
        return ch === '\t' || ch === ' ' || ch === '\n' || ch === '\r';
    }

    // Digit alphabets: the position of a character is its numeric value.
    const digitsUpperCase = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const digitsLowerCase = "0123456789abcdefghijklmnopqrstuvwxyz";

    /** Determine whether the given character is a digit for the given base
     *
     * Ignores case, i.e. 'a' and 'A' are considered the same character.
     * A character is a digit for a base exactly when its numeric value
     * (0-9 for '0'-'9', 10-35 for the letters) is smaller than the base.
     * @member Parser.Lexer
     * @private
     * @param {(string|undefined)}  ch   The character to check; <code>undefined</code> is never a digit
     * @param {number}  base The base to check against (e.g., 10 for decimal numbers, 16 for hexadecimal, etc.)
     * @return {boolean} <code>true</code> if and only if the character is a digit for the given base
     */
    function isBaseDigit(ch, base) {
        // NOTE(review): `assert` is not defined in this file; presumably a project-wide helper.
        assert(base <= 36);
        // Guard against end-of-input (undefined) and against the empty string,
        // which indexOf() would otherwise report at position 0.
        if (typeof ch !== 'string' || ch.length !== 1) {
            return false;
        }
        let value = digitsUpperCase.indexOf(ch);
        if (value < 0) {
            value = digitsLowerCase.indexOf(ch);
        }
        return value >= 0 && value < base;
    }

    /** Determine whether the given character is a decimal digit
     * @member Parser.Lexer
     * @private
     * @param {string}  ch   The character to check
     * @return {boolean} <code>true</code> if and only if the character is a decimal digit
     */
    function isDigit(ch) {
        return ch >= '0' && ch <= '9';
    }

    /** Determine whether the given character is an octal digit
     * @member Parser.Lexer
     * @private
     * @param {string}  ch   The character to check
     * @return {boolean} <code>true</code> if and only if the character is an octal digit
     */
    function isOctalDigit(ch) {
        return ch >= '0' && ch <= '7';
    }

    /** Determine whether the given character is a letter (A-Z or a-z)
     * @member Parser.Lexer
     * @private
     * @param {string}  ch   The character to check
     * @return {boolean} <code>true</code> if and only if the character is a letter
     */
    function isLetter(ch) {
        return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
    }

    /** Determine whether the given character can be the start of an identifier
     * @member Parser.Lexer
     * @private
     * @param {string}  ch   The character to check
     * @return {boolean} <code>true</code> if and only if the character can be the start of an identifier
     */
    function isIdentifierStart(ch) {
        return isLetter(ch) || ch === '_';
    }

    /** Determine whether the given character can be part of an identifier
     * @member Parser.Lexer
     * @private
     * @param {string}  ch   The character to check
     * @return {boolean} <code>true</code> if and only if the character can be part of an identifier
     */
    function isIdentifierPart(ch) {
        return isIdentifierStart(ch) || isDigit(ch);
    }

    /** Skip whitespace and comments until the first significant character
     * @member Parser.Lexer
     * @private
     */
    function skipSpaces() {
        while (!endOfString()) {
            const ch = peekNextChar();
            if (ch === '#') {
                skipComment();
            } else if (!isWhitespace(ch)) {
                break;
            }
            // Consumes the whitespace character, or the newline that ended a comment.
            skipChar();
        }
    }

    /** Create a token from the currently parsed input
     *
     * The token covers the input from the start of the current token up to
     * (but excluding) the current input position.
     * @member Parser.Lexer
     * @private
     * @param {string}   type   The type of the token to create
     * @param            value  The value of the token
     * @returns {Parser.Token}  The newly created token
     */
    function createToken(type, value) {
        return new Parser.Token(type, value, marker, index, input.substring(marker, index));
    }

    /** Parse the digits of a number of the given base
     *
     * Any base prefix must already have been consumed by the caller.
     * @member Parser.Lexer
     * @private
     * @param {number}   base    The base of the number to parse
     * @returns {Parser.Token}  The token representing the parsed number
     * @throws {Parser.LexerError} If there is no digit of the given base at the current position
     */
    function parseAnyNumber(base) {
        const digitsStart = index;
        while (isBaseDigit(peekNextChar(), base)) {
            skipChar();
        }
        const digits = input.substring(digitsStart, index);
        if (digits === '') {
            // E.g. "0x" or "0b" without digits: parseInt would silently yield NaN.
            error('Missing digits in number');
        }
        return createToken(Parser.TokenType.Number, parseInt(digits, base));
    }

    /** Parse a run of identifier characters into a token of the given type
     *
     * The first character is consumed unconditionally (the caller has already
     * classified it); all following identifier characters are appended.
     * The token value equals the token text.
     * @member Parser.Lexer
     * @private
     * @param {string} type The type of the token to create
     * @returns {Parser.Token} The newly created token
     */
    function parseWord(type) {
        skipChar();
        while (isIdentifierPart(peekNextChar())) {
            skipChar();
        }
        return createToken(type, input.substring(marker, index));
    }

    /** Parse a register name
     * A register name is composed of a dollar sign followed by an identifier or number.
     * The token value includes the leading dollar sign.
     * @member Parser.Lexer
     * @private
     * @returns {Parser.Token} The token representing the register
     */
    function parseRegister() {
        return parseWord(Parser.TokenType.Register);
    }

    /** Parse a number
     * A number can be binary (<code>0b</code> prefix), octal (leading <code>0</code>),
     * decimal or hexadecimal (<code>0x</code> prefix).
     *
     * @member Parser.Lexer
     * @private
     * @returns {Parser.Token}  The token representing the parsed number
     * @throws {Parser.LexerError} If a base prefix is not followed by at least one digit
     */
    function parseNumber() {
        if (peekNextChar() !== '0') {
            return parseAnyNumber(10);
        }

        skipChar(); // the leading zero
        const ch = peekNextChar();
        switch (ch) {
        case 'b': case 'B':
            skipChar();
            return parseAnyNumber(2);
        case 'x': case 'X':
            skipChar();
            return parseAnyNumber(16);
        default:
            if (isOctalDigit(ch)) {
                return parseAnyNumber(8);
            }
            // A lone zero
            return createToken(Parser.TokenType.Number, 0);
        }
    }

    /** Parse an identifier
     * @member Parser.Lexer
     * @private
     * @returns {Parser.Token}  A token representing the identifier
     */
    function parseIdentifier() {
        return parseWord(Parser.TokenType.Identifier);
    }

    // Single-character atoms that are never the prefix of a longer atom.
    const atomTypes = {
        '+': Parser.TokenType.Addition,
        '-': Parser.TokenType.Subtraction,
        '*': Parser.TokenType.Multiplication,
        '/': Parser.TokenType.Division,
        '%': Parser.TokenType.Remainder,
        '^': Parser.TokenType.BitwiseXOR,
        '~': Parser.TokenType.BitwiseNOT,
        '(': Parser.TokenType.LParen,
        ')': Parser.TokenType.RParen,
        ',': Parser.TokenType.Comma,
        ':': Parser.TokenType.Colon,
        '?': Parser.TokenType.QuestionMark,
    };

    /** Parse an atom
     * Atoms are operators such as `>>>` or `+`. The longest matching atom is chosen.
     * Atom tokens carry no value.
     * @member Parser.Lexer
     * @private
     * @returns {Parser.Token}  A token representing the atom
     * @throws {Parser.LexerError} If the next character does not start any known atom
     */
    function parseAtom() {
        const types = Parser.TokenType;
        const ch = peekNextChar();
        switch (ch) {
        case '<':
            skipChar();
            if (acceptChar('<')) {
                return createToken(types.LogicalShiftLeft);
            }
            if (acceptChar('=')) {
                return createToken(types.LessEqual);
            }
            return createToken(types.LessThan);
        case '>':
            skipChar();
            if (acceptChar('>')) {
                // `>>>` is the logical shift, `>>` the arithmetic one
                if (acceptChar('>')) {
                    return createToken(types.LogicalShiftRight);
                }
                return createToken(types.ArithmeticShiftRight);
            }
            if (acceptChar('=')) {
                return createToken(types.GreaterEqual);
            }
            return createToken(types.GreaterThan);
        case '|':
            skipChar();
            if (acceptChar('|')) {
                return createToken(types.LogicalOR);
            }
            return createToken(types.BitwiseOR);
        case '&':
            skipChar();
            if (acceptChar('&')) {
                return createToken(types.LogicalAND);
            }
            return createToken(types.BitwiseAND);
        case '=':
            skipChar();
            if (acceptChar('=')) {
                return createToken(types.Equals);
            }
            return createToken(types.Assignment);
        case '!':
            skipChar();
            if (acceptChar('=')) {
                return createToken(types.NotEquals);
            }
            return createToken(types.LogicalNOT);
        default:
            if (Object.prototype.hasOwnProperty.call(atomTypes, ch)) {
                skipChar();
                return createToken(atomTypes[ch]);
            }
        }
        // The offending character has not been consumed at this point.
        error('Unknown token');
    }

    /** Get the next token and advance the input position
     * @returns {Parser.Token} the next token from the input stream,
     *          or an end-of-string token if the end of the string has been reached
     * @throws {Parser.LexerError} If the input at the current position does not form a valid token
     */
    this.next = function() {
        skipSpaces();

        startToken();
        if (endOfString()) {
            return createToken(Parser.TokenType.EndOfString);
        }

        const ch = peekNextChar();
        if (ch === '$') {
            return parseRegister();
        }
        if (isDigit(ch)) {
            return parseNumber();
        }
        if (isIdentifierStart(ch)) {
            return parseIdentifier();
        }
        return parseAtom();
    };
};