digitalmars.D.learn - Recursive-descent parsing
- Andrew Edwards (352/352) Dec 24 2016 The authors of "The Art of Java" present, as a first coding
- Stefan Koch (17/369) Dec 24 2016 All of the performance characteristics crucially depend on the
The authors of "The Art of Java" present, as a first coding example, a recursive-descent parser to demonstrate Java's ability to facilitate low level programming commonly performed in C and C++. I took the opportunity to port the code to D. By doing this, I now have an understanding of how a recursive-descent parser is written and works. What I am now interested in, is learning about what makes this particular implementation a poor or strong one, how to improve upon it and other, possibly superior, implementations made possible by D. Comments are invited and appreciated. Andrew Edwards If at first you don't succeed... ========================= rdp2.d ========================= //module rdp2; class ParserException : Throwable { this(string msg) { super(msg); } } // These are the token types. enum { NONE, DELIMITER, VARIABLE, NUMBER } // These are the types of syntax errors. enum { SYNTAX, UNBALPARENS, NOEXP, DIVBYZERO } // This token indecates end-of-expression. enum EOE = ";"; string exp; // refers to expression string int ndx; // current index into the expression string token; // holds current token int tokType; // holds token's types // Array for variables. double[] vars = new double[26]; // Parser entry poing. double evaluate(string expstr) { double result; exp = expstr; ndx = 0; getToken(); if (token == EOE) handleErr(NOEXP); // no expression present // Parse and evaluate the expression. result = evalExp1(); if (token != EOE) handleErr(SYNTAX); // last token must be EOE return result; } // Process an assignment. double evalExp1() { double result; int varNdx; int ttokType; string tmpToken; if (tokType == VARIABLE) { // save old token tmpToken = token; ttokType = tokType; // Compute the index of the variable. import std.ascii: toUpper; varNdx = token[0].toUpper - 'A'; getToken(); if (token != "=") { putBack(); // return current token // restore old token -- not an assignment token = tmpToken; tokType = ttokType; } else { getToken(); // get next part of exp result = evalExp2(); vars[varNdx] = result; return result; } } return evalExp2(); } // Add or subtract two terms double evalExp2() { char op; double result; double partialResult; result = evalExp3(); while ((op = token[0]) == '+' || op == '-') { getToken(); partialResult = evalExp3(); final switch (op) { case '-': { result -= partialResult; break; } case '+': { result += partialResult; break; } } } return result; } // Multiply or divide two factors double evalExp3() { char op; double result; double partialResult; result = evalExp4(); while ((op = token[0]) == '*' || op == '/' || op == '%') { getToken(); partialResult = evalExp4(); final switch (op) { case '*': { result *= partialResult; break; } case '/': { if (partialResult == 0.0) handleErr(DIVBYZERO); result /= partialResult; break; } case '%': { if (partialResult == 0.0) handleErr(DIVBYZERO); result %= partialResult; break; } } } return result; } // Process an exponent. double evalExp4() { double result; double partialResult; double ex; result = evalExp5(); if (token == "^") { getToken(); partialResult = evalExp4(); ex = result; if (partialResult == 0.0) { result = 1.0; } else { for (int t = cast(int)partialResult-1; t > 0; t--) result *= ex; } } return result; } // Evaluate a unary + or - double evalExp5() { double result; string op; op = null; if ((tokType == DELIMITER) && token == "+" || token == "-") { op = token; getToken(); } result = evalExp6(); //if (op == "-") result = -result; return (op == "-") ? -result : result; } // Process a parenthesized expression. double evalExp6() { double result; if (token == "(") { getToken(); result = evalExp2(); if (token != ")") handleErr(UNBALPARENS); getToken(); } else result = atom(); return result; } // Get the value of a number. double atom() { double result = 0.0; switch (tokType) { case NUMBER: try { import std.conv: parse; result = token.parse!double; } catch (Exception e) { handleErr(SYNTAX); } getToken(); break; case VARIABLE: result = findVar(token); getToken(); break; default: handleErr(SYNTAX); break; } return result; } // Return the value of a variable. double findVar(string vname) { import std.ascii: isAlpha, toUpper; if (!vname[0].isAlpha) { handleErr(SYNTAX); return 0.0; } return vars[(vname[0] - 'A').toUpper]; } // Return a token to the input stream. void putBack() { if (token == EOE) return; foreach(i; 0 .. token.length) ndx--; } // Handle an error. void handleErr(int error) { string[] err = [ "Syntax Error", "Unbalanced Parentheses", "No Expression Present", "Division by Zero" ]; throw new ParserException(err[error]); } // Obtain the next token. void getToken() { import std.ascii: isAlpha, isDigit, isWhite; tokType = NONE; token = null; // Check for end of expression if (ndx == exp.length) { token = EOE.dup; return; } // Skip over white space while (ndx < exp.length && exp[ndx].isWhite) ++ndx; // Trailing whitespace ends expression. if (ndx == exp.length) { token = EOE.dup; return; } if (exp[ndx].isDelim) // is operator { token ~= exp[ndx]; ndx++; tokType = DELIMITER; } else if (exp[ndx].isAlpha) // is variable { while (!exp[ndx].isDelim) { token ~= exp[ndx]; ndx++; if (ndx >= exp.length) break; } tokType = VARIABLE; } else if (exp[ndx].isDigit) // is number { while (!exp[ndx].isDelim) { token ~= exp[ndx]; ndx++; if (ndx >= exp.length) break; } tokType = NUMBER; } else // unknown character terminates expression { token = EOE.dup; return; } } // Returns true if c is a delimiter. bool isDelim(char c) { import std.algorithm: canFind; return canFind(" +-/*%^=()", c); } ========================= demordp.d ========================= mixin(import("rdp2.d")); void main(string[] args) { string expr; import std.stdio: write, writeln, readln; writeln("Enter an empty expression to stop."); for(;;) { write("Enter an expression: "); expr = readln(); if (expr == "\n") break; try { writeln("Result: ", expr.evaluate); } catch (ParserException e) { e.msg.writeln; } } }
Dec 24 2016
On Saturday, 24 December 2016 at 12:42:31 UTC, Andrew Edwards wrote:The authors of "The Art of Java" present, as a first coding example, a recursive-descent parser to demonstrate Java's ability to facilitate low level programming commonly performed in C and C++. I took the opportunity to port the code to D. By doing this, I now have an understanding of how a recursive-descent parser is written and works. What I am now interested in, is learning about what makes this particular implementation a poor or strong one, how to improve upon it and other, possibly superior, implementations made possible by D. Comments are invited and appreciated. Andrew Edwards If at first you don't succeed... ========================= rdp2.d ========================= //module rdp2; class ParserException : Throwable { this(string msg) { super(msg); } } // These are the token types. enum { NONE, DELIMITER, VARIABLE, NUMBER } // These are the types of syntax errors. enum { SYNTAX, UNBALPARENS, NOEXP, DIVBYZERO } // This token indecates end-of-expression. enum EOE = ";"; string exp; // refers to expression string int ndx; // current index into the expression string token; // holds current token int tokType; // holds token's types // Array for variables. double[] vars = new double[26]; // Parser entry poing. double evaluate(string expstr) { double result; exp = expstr; ndx = 0; getToken(); if (token == EOE) handleErr(NOEXP); // no expression present // Parse and evaluate the expression. result = evalExp1(); if (token != EOE) handleErr(SYNTAX); // last token must be EOE return result; } // Process an assignment. double evalExp1() { double result; int varNdx; int ttokType; string tmpToken; if (tokType == VARIABLE) { // save old token tmpToken = token; ttokType = tokType; // Compute the index of the variable. import std.ascii: toUpper; varNdx = token[0].toUpper - 'A'; getToken(); if (token != "=") { putBack(); // return current token // restore old token -- not an assignment token = tmpToken; tokType = ttokType; } else { getToken(); // get next part of exp result = evalExp2(); vars[varNdx] = result; return result; } } return evalExp2(); } // Add or subtract two terms double evalExp2() { char op; double result; double partialResult; result = evalExp3(); while ((op = token[0]) == '+' || op == '-') { getToken(); partialResult = evalExp3(); final switch (op) { case '-': { result -= partialResult; break; } case '+': { result += partialResult; break; } } } return result; } // Multiply or divide two factors double evalExp3() { char op; double result; double partialResult; result = evalExp4(); while ((op = token[0]) == '*' || op == '/' || op == '%') { getToken(); partialResult = evalExp4(); final switch (op) { case '*': { result *= partialResult; break; } case '/': { if (partialResult == 0.0) handleErr(DIVBYZERO); result /= partialResult; break; } case '%': { if (partialResult == 0.0) handleErr(DIVBYZERO); result %= partialResult; break; } } } return result; } // Process an exponent. double evalExp4() { double result; double partialResult; double ex; result = evalExp5(); if (token == "^") { getToken(); partialResult = evalExp4(); ex = result; if (partialResult == 0.0) { result = 1.0; } else { for (int t = cast(int)partialResult-1; t > 0; t--) result *= ex; } } return result; } // Evaluate a unary + or - double evalExp5() { double result; string op; op = null; if ((tokType == DELIMITER) && token == "+" || token == "-") { op = token; getToken(); } result = evalExp6(); //if (op == "-") result = -result; return (op == "-") ? -result : result; } // Process a parenthesized expression. double evalExp6() { double result; if (token == "(") { getToken(); result = evalExp2(); if (token != ")") handleErr(UNBALPARENS); getToken(); } else result = atom(); return result; } // Get the value of a number. double atom() { double result = 0.0; switch (tokType) { case NUMBER: try { import std.conv: parse; result = token.parse!double; } catch (Exception e) { handleErr(SYNTAX); } getToken(); break; case VARIABLE: result = findVar(token); getToken(); break; default: handleErr(SYNTAX); break; } return result; } // Return the value of a variable. double findVar(string vname) { import std.ascii: isAlpha, toUpper; if (!vname[0].isAlpha) { handleErr(SYNTAX); return 0.0; } return vars[(vname[0] - 'A').toUpper]; } // Return a token to the input stream. void putBack() { if (token == EOE) return; foreach(i; 0 .. token.length) ndx--; } // Handle an error. void handleErr(int error) { string[] err = [ "Syntax Error", "Unbalanced Parentheses", "No Expression Present", "Division by Zero" ]; throw new ParserException(err[error]); } // Obtain the next token. void getToken() { import std.ascii: isAlpha, isDigit, isWhite; tokType = NONE; token = null; // Check for end of expression if (ndx == exp.length) { token = EOE.dup; return; } // Skip over white space while (ndx < exp.length && exp[ndx].isWhite) ++ndx; // Trailing whitespace ends expression. if (ndx == exp.length) { token = EOE.dup; return; } if (exp[ndx].isDelim) // is operator { token ~= exp[ndx]; ndx++; tokType = DELIMITER; } else if (exp[ndx].isAlpha) // is variable { while (!exp[ndx].isDelim) { token ~= exp[ndx]; ndx++; if (ndx >= exp.length) break; } tokType = VARIABLE; } else if (exp[ndx].isDigit) // is number { while (!exp[ndx].isDelim) { token ~= exp[ndx]; ndx++; if (ndx >= exp.length) break; } tokType = NUMBER; } else // unknown character terminates expression { token = EOE.dup; return; } } // Returns true if c is a delimiter. bool isDelim(char c) { import std.algorithm: canFind; return canFind(" +-/*%^=()", c); } ========================= demordp.d ========================= mixin(import("rdp2.d")); void main(string[] args) { string expr; import std.stdio: write, writeln, readln; writeln("Enter an empty expression to stop."); for(;;) { write("Enter an expression: "); expr = readln(); if (expr == "\n") break; try { writeln("Result: ", expr.evaluate); } catch (ParserException e) { e.msg.writeln; } } }All of the performance characteristics crucially depend on the language you want to parse. In general you want to match as many tokes at as you can in a single parse-tree node. Since that will reduce the number of function calls you have to make. You want to preallocate buffers. Use the append-functionality sparingly since that involves a call into a shared library. Avoid or encapsulate global state if you can since it can get tricky to mange. Other then that there is not much general advice I can give. Try to tag your parse-nodes since then you can match patterns using a nested switch, which is one of the cleanest routes to do it.
Dec 24 2016