Parsers and Parser Combinators

Parsers and Parser Combinators

·

9 min read

A parser is simply a function which takes in a string, does some parsing job, and output the parsing result.

For example, below is simple parser which can do one simple job: parse the character a.

const a = s => {
    if (s.startsWith("a")) return true;
    else return false;
};

console.log(a("abc")); // true
console.log(a("bc"));  // false

What about character b or c or others? To make it more general, we can write a function char which can takes in any character and return a parser to parse that character.

const char = c => s => {
    if (s.startsWith(c)) return true;
    else return false;
};

const a = char("a");
const b = char("b");

console.log(a("abc")); // true
console.log(b("bc"));  // true

OK, it can create parsers to parse any character. But now it only can parse the first character of the input string, we want to specify the start index. And also, if the parsing succeeded, we want the parsing result, not just a boolean value. And if the parsing failed, then we want an error message.

Then, we come up with an object, which contains all the information.

function createInitState(data) {
    return {
        data: data,
        index: 0,
        error: null,
        value: null,
    };
}

We pass it to the parser and the parser should return the same structure so can pass the result to another parser. So our char function becomes this.

const char = c => state => {
    if (state.error) return state;

    const { index, data } = state;
    if (index >= data.length) {
        return {
            ...state,
            error: `not enough data to parse, index: ${index}, char: ${c}`
        };
    }

    const value = data.charAt(index);
    if (value !== c) {
        return {
            ...state,
            error: `parse error, index: ${index}, expect: ${c}, got: ${value}`
        };
    }

    return {
        ...state,
        index: index + 1,
        value
    };
};

OK, now we can parse a string like below.

const a = char("a");
const b = char("b");
const c = char("c");

let state = createInitState("abc");

state = a(state);
console.log(state);
// { data: 'abc', index: 1, error: null, value: 'a' }

state = b(state);
console.log(state);
// { data: 'abc', index: 2, error: null, value: 'b' }

state = c(state);
console.log(state);
// { data: 'abc', index: 3, error: null, value: 'c' }

As you read this, you may want to ask, yes, it can parse one character, what's the big deal? Well, now comes the parser combinators. A parser combinator is just a function, which accepts parsers as input and output a new parser. You can think that its job is to combine multiple parser together to make a more complicated one.

Let's build our first parser combinator. Instead of one specified character, we want to create a parser which can try mutiple parser, if any of them succeed, then return the result. If none of them succeed, then return an error. It's like the effect of or. Let's start with the most simple or, just takes in two parsers.

const _or = (p1, p2) => state => {
    if (state.error) return state;

    let nextState = p1(state);
    if (!nextState.error) return nextState;

    nextState = p2(state);
    if (!nextState.error) return nextState;

    return {
        ...state,
        error: `error or`,
    };
};

The code is very simple, it just try to use the input parsers as requested. Now we can create a parser to parse one character or another character.

const a = char("a");
const b = char("b");

const ab = _or(a, b);

console.log(ab(createInitState("abc")));
// { data: 'abc', index: 1, error: null, value: 'a' }

console.log(ab(createInitState("bc")));
// { data: 'bc', index: 1, error: null, value: 'b' }

Now our or function can only takes 2 parsers. Based on this, we can easily make it takes in any number of parsers.

const or = (...parsers) => state => {
    return parsers.reduce((p1, p2) => _or(p1, p2))(state);
};

As you can see, it's so simple. We just need to try one by one. With this new or function, we can create a digit parser, which could parse any digit number.

const zero = char("0");
const one = char("1");
const two = char("2");
const three = char("3");
const four = char("4");
const five = char("5");
const six = char("6");
const seven = char("7");
const eight = char("8");
const nine = char("9");

const digit = or(zero, one, two, three, four, five, six, seven, eight, nine);

console.log(digit(createInitState("123")));
console.log(digit(createInitState("23")));
console.log(digit(createInitState("3")));

// { data: '123', index: 1, error: null, value: '1' }
// { data: '23', index: 1, error: null, value: '2' }
// { data: '3', index: 1, error: null, value: '3' }

OK, enough with one character, we want to parse more character. Similar to function or, we can create a function and. Its job is to takes in multiple parsers and try to run the parsers one by one. If any of the parsers failed, then we return an error. If all of then succeed, then we return the combined result values.

// same thing as _or, we start with 2 parsers input _and
const _and = (p1, p2) => state => {
    if (state.error) return state;

    let p1State = p1(state);
    if (p1State.error) return p1State;

    let p2State = p2(p1State);
    if (p2State.error) return p2State;

    return {
        ...p2State,
        value: [p1State.value, p2State.value]
    };
};

// based on _and, we build this general and
const and = (...parsers) => state => {
    if (state.error) return state;

    const parser = parsers.reduce((p1, p2) => {
        return _and(p1, p2);
    });
    const nextState = parser(state);
    if (nextState.error) return nextState;

    nextState.value = nextState.value.flat(Infinity);
    return nextState;
};

With this and function, we can create a parser which could parse any specified string.

const h = char("h");
const e = char("e");
const l = char("l");
const o = char("o");

const hello = and(h, e, l, l, o);

console.log(hello(createInitState("hello123")));

// {
//     data: 'hello123',
//     index: 5,
//     error: null,
//     value: [ 'h', 'e', 'l', 'l', 'o' ]
// }

Let's dig deeper, now we only can create a specified number of characters. We want to create a parser combinator which should parse any number of characters as long as there is no error. For example, we can create a function many, which takes in only one parser. And the returned parser should try to use this one parser to parse the string endlessly until there is an error. It like the meta character * in regular expressions.

const many = parser => state => {
    if (state.error) return state;

    let value = [];
    let pre = state;
    while (true) {
        state = parser(pre);
        if (state.error) break;
        value.push(state.value);
        pre = state;
    }

    return {
        ...pre,
        error: null,
        value
    };
};

As you can see, with prior experience with or and and, this many should seems easy. It just use a while loop to do the greedy parsing like *.

With this many function, we can create a parser to parse a number, not just one character, a complete number.

const digits = many(digit);

console.log(digits(createInitState("123abc")));
// { data: '123abc', index: 3, error: null, value: [ '1', '2', '3' ] }

Because the many function could parse empty as well, so with digits we want to ensure there is at least one digit, so we could do this.

const digits = and(digit, many(digit))

It's like the effect of + in regular expressions. You can create a new parser combinator for this task too.

As you can see, now we can parse a number successfully. But the result value is not good. We want to number 123, not an array of [ '1', '2', '3' ]. Yes we can convert the result in the many function, but that is not a good idea. We can create a map function, which takes in a parser and a callback function, and use this callback function to convert the result.

const map = (parser, fn) => state => {
    if (state.error) return state;

    state = parser(state);
    if (state.error !== null) return state;

    state.value = fn(state.value);
    return state;
};

Now let's convert result to real numbers.

const digits = map(and(digit, many(digit)), value => Number(value.join("")));

console.log(digits(createInitState("123abc")));
// { data: '123abc', index: 3, error: null, value: 123 }

Well, these are basics of parser combinators. Let's try to use what we learn to solve some problems. The leetcode problem Ternary Expression Parser is a good example.

A ternary is like a simple version of if else statement. The most simple example is like "T?2:3". Because T stands for true, so we should return the value 2. Things may become complicated when these expressions can be nested like F?1:T?4:5, or a more complicated one F?T?F?7:F?F?F?3:F?F?0:1:0:6:1:0:5. Now let's see how to solve this problem by parser combinators.

We define some basic elements of this expression first.

const T = map(char("T"), value => ({ type: "bool", value }));
const F = map(char("F"), value => ({ type: "bool", value }));

const question = char("?");
const colon = char(":");

const digit = map(
    or(zero, one, two, three, four, five, six, seven, eight, nine),
    value => ({
        type: "digit",
        value,
    })
);

And then the ternary expression's structure should looks like below.

const ternary = map(
    and(
        or(T, F),
        question,
        or(ternary, digit, T, F),
        colon,
        or(ternary, digit, T, F),
    ),
    value => {
        return {
            type: "expression",
            value
        };
    }
);

Because the expression is a nested structure, so as you can see, we use a recursive approach to expression this structure.

But if we try to run this, we should get an error.

        or(ternary, digit, T, F),
           ^

ReferenceError: Cannot access 'ternary' before initialization

The error is obvious, we can't use ternary when initilizing ternary. So one solution is that we can create a lazy function to make this parser combinator do lazy execution.

const lazy = thunk => state => thunk()(state);

It's very similar to other parser combinators, but in this time, it takes in a thunk function, which is used to return the real parser.

Now we can make a parser combinator runs lazily.

lazy(() => or(a, b));

As you can see, in this time, even if the lazy function is called immediately, the or function will not be called until passing the state to do the realing parsing job.

So now the tenary parser combinator should looks like below.

const ternary = map(
    and(
        or(T, F),
        question,
        lazy(() => or(ternary, digit, T, F)),
        colon,
        lazy(() => or(ternary, digit, T, F)),
    ),
    value => {
        return {
            type: "expression",
            value
        };
    }
);

Now let's use it to parse a ternary expression F?1:T?4:5.

var parseTernary = function (expression) {
    let state = {
        data: expression,
        index: 0,
        error: null,
        value: null,
    };

    state = ternary(state);
    console.log(JSON.stringify(state, null, 2))
};

console.log(parseTernary("F?1:T?4:5"));

/**

{
  "data": "F?1:T?4:5",
  "index": 9,
  "error": null,
  "value": {
    "type": "expression",
    "value": [
      {
        "type": "bool",
        "value": "F"
      },
      "?",
      {
        "type": "digit",
        "value": "1"
      },
      ":",
      {
        "type": "expression",
        "value": [
          {
            "type": "bool",
            "value": "T"
          },
          "?",
          {
            "type": "digit",
            "value": "4"
          },
          ":",
          {
            "type": "digit",
            "value": "5"
          }
        ]
      }
    ]
  }
}

*/

With this parsing result, we can evaluate the result easily.

const evaluate = node => {
    switch (node.type) {
        case "expression":
            return evaluate(node.value[0]) === "T" ?
                evaluate(node.value[2]) : evaluate(node.value[4]);
        default:
            return node.value;
    }
};

Finally, we solve the problem.

var parseTernary = function (expression) {
    let state = {
        data: expression,
        index: 0,
        error: null,
        value: null,
    };

    state = ternary(state);
    return evaluate(state.value);
};

console.log(parseTernary("F?1:T?4:5")); // 4
console.log(parseTernary("F?T?F?7:F?F?F?3:F?F?0:1:0:6:1:0:5")); // 5