Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Grammar to parse tree-siter grammars #147

Open
mingodad opened this issue Dec 14, 2021 · 0 comments
Open

Grammar to parse tree-siter grammars #147

mingodad opened this issue Dec 14, 2021 · 0 comments
Labels
enhancement New feature or request

Comments

@mingodad
Copy link
Contributor

People writing grammars for https://github.com/tree-sitter/tree-sitter need to use a ugly JavaScript spaghetti to write grammars and convert then to a json that the actual parser uses to do it's job.

I've been looking for a simple way to write the grammars in a style like yacc/bison/peg and then the tool would write the final json for the tree-sitter parser, I think that PeppaPEG would be a nice fit for this job and for that we need read the actual json grammars and convert to an augmented EBNF then people would write/extend on that EBNF and the tool would again write back the json.

A feature like this one would improve the visibility and user base of PeppaPEG.

Example here is a Lua grammar for tree-sitter https://github.com/Azganoth/tree-sitter-lua/blob/master/grammar.js and here the final json https://github.com/Azganoth/tree-sitter-lua/blob/master/src/grammar.json and here an initial EBNF grammar :

%name lua
%extras
	comment
	/[\s\n]/
	;
%conflicts
	[ _prefix ]
	[ _expression _variable_declarator ]
	[ _expression function_call_statement ]
	[ function_name function_name_field ]
	;
%externals
	comment
	string
	;
%inline
	_statement
	;
program :  _statement*  return_statement?  ;
return_statement :  "return"  (  _expression (  ","  _expression ) *  ) ?  _empty_statement?  ;
_statement : expression |  variable_declaration |  local_variable_declaration |  do_statement |  if_statement |  while_statement |  repeat_statement |  for_statement |  for_in_statement |  goto_statement |  break_statement |  label_statement |  _empty_statement | function | local_function | function_call ;
variable_declaration :  ( variable_declarator (  "," variable_declarator ) *  )  "="  (  _expression (  ","  _expression ) *  )  ;
local_variable_declaration :  "local" variable_declarator (  "="  (  _expression (  ","  _expression ) *  )  ) ?  ;
_variable_declarator :  identifier |  (  _prefix "["  _expression "]"  )  |  field_expression ;
field_expression :  _prefix "." property_identifier ;
_local_variable_declarator :  identifier (  ","  identifier ) *  ;
do_statement :  "do"  _statement*  return_statement?  "end"  ;
if_statement :  "if" condition_expression "then"  _statement*  return_statement?  elseif*  else?  "end"  ;
elseif :  "elseif" condition_expression "then"  _statement*  return_statement?  ;
else :  "else"  _statement*  return_statement?  ;
while_statement :  "while" condition_expression "do"  _statement*  return_statement?  "end"  ;
repeat_statement :  "repeat"  _statement*  return_statement?  "until" condition_expression ;
for_statement :  "for" loop_expression "do"  _statement*  return_statement?  "end"  ;
for_in_statement :  "for" loop_expression "do"  _statement*  return_statement?  "end"  ;
_loop_expression :  identifier "="  _expression ","  _expression (  ","  _expression ) ?  ;
_in_loop_expression :  (  identifier (  ","  identifier ) *  )  "in"  (  _expression (  ","  _expression ) *  )  ;
goto_statement :  "goto"  identifier ;
break_statement :  "break"  ;
label_statement :  "::"  identifier "::"  ;
_empty_statement :  ";"  ;
function_statement :  "function"  function_name _function_body ;
local_function_statement :  "local"  "function"  identifier _function_body ;
function_call_statement :   (  ( (  _prefix arguments )  |  (  _prefix ":" method arguments )  )  )  ;
arguments :  (  "("  (  _expression (  ","  _expression ) *  ) ?  ")"  )  |  table |  string ;
function_name :  ( identifier |  function_name_field )  (  ":" method ) ?  ;
function_name_field :  (  identifier )  (  "." property_identifier ) *  ;
parameters :  "("  (  ( self |  spread |  identifier )  (  ","  identifier ) *  (  ","  spread ) ?  ) ?  ")"  ;
_function_body :  parameters _statement*  return_statement?  "end"  ;
_expression :  spread |  _prefix |  next |  function_definition |  table |  binary_operation |  unary_operation |  string |  number |  nil |  true |  false |  identifier ;
spread :  "..."  ;
self :  "self"  ;
next :  "next"  ;
global_variable :  "_G"  |  "_VERSION"  ;
_prefix :  self |  global_variable |  _variable_declarator |   ( function_call )  |  (  "("  _expression ")"  )  ;
function_definition :  "function"  _function_body ;
table :  "{"  _field_sequence?  "}"  ;
field :  (  "["  _expression "]"  "="  _expression )  |  (  identifier "="  _expression )  |  _expression ;
_field_sequence :   (  (  field (  _field_sep field ) *  _field_sep?  )  )  ;
_field_sep :  ","  |  ";"  ;
binary_operation :   (  (  _expression "or"  _expression )  )  |   (  (  _expression "and"  _expression )  )  |   (  (  _expression "<"  _expression )  )  |   (  (  _expression "<="  _expression )  )  |   (  (  _expression "=="  _expression )  )  |   (  (  _expression "~="  _expression )  )  |   (  (  _expression ">="  _expression )  )  |   (  (  _expression ">"  _expression )  )  |   (  (  _expression "|"  _expression )  )  |   (  (  _expression "~"  _expression )  )  |   (  (  _expression "&"  _expression )  )  |   (  (  _expression "<<"  _expression )  )  |   (  (  _expression ">>"  _expression )  )  |   (  (  _expression "+"  _expression )  )  |   (  (  _expression "-"  _expression )  )  |   (  (  _expression "*"  _expression )  )  |   (  (  _expression "/"  _expression )  )  |   (  (  _expression "//"  _expression )  )  |   (  (  _expression "%"  _expression )  )  |   (  (  _expression ".."  _expression )  )  |   (  (  _expression "^"  _expression )  )  ;
unary_operation :   (  (  ( "not"  |  "#"  |  "-"  |  "~"  )  _expression )  )  ;
number :  (  ( ( (  ( "0"  |  (  "0" ?  /[1-9]/ /[0-9]+/?  )  )  "."  /[0-9]+/?  (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  |  (  "."  /[0-9]+/ (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  |  (  ( "0"  |  (  "0" ?  /[1-9]/ /[0-9]+/?  )  )  (  ( "e"  |  "E"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  )  |  (  ( "0x"  |  "0X"  )  /[a-fA-F0-9]+/ (  "."  /[a-fA-F0-9]+/ ) ?  (  ( "p"  |  "P"  )  (  ( "-"  |  "+"  ) ?  /[0-9]+/ )  ) ?  )  )  )  ;
nil :  "nil"  ;
true :  "true"  ;
false :  "false"  ;
identifier :  /[a-zA-Z_][a-zA-Z0-9_]*/ ;

Generated by this script (using quickjs https://bellard.org/quickjs/):

import * as std from "std";
import * as os from "os";

//print(typeof process);
//if(typeof std == "undefined") {
//}

const fname_list = [
	"tree-sitter-lua/src/grammar.json",
];

//const fname_base = "A_grammars/tree-sitter/";
const fname_base = "../tree-sitter/";

function parseJsonGrammar(fname)
{
	let fd = std.open(fname_base + fname, "r");
	let json = fd.readAsString();
	fd.close();

	let out_fname = fname.replace(/\/.+/, ".ebnf0");
	//print(out_fname); return;
	//fd = std.open(out_fname, "w");
	fd = std.open("/dev/stdout", "w"); //std.stdout;

	json = JSON.parse(json);
	//print(json);

	fd.printf("%%name %s\n", json.name);
	if(json.word) fd.printf("%%word %s\n", json.word);

	let manageTuples = function(dict) {
		for(var idx in  dict) {
			let value =  dict[idx];
			//print(value);
			switch(value.type) {
				case "SYMBOL":
					fd.printf("\t%s\n", value.name);
					break;
				case "PATTERN":
					fd.printf("\t/%s/\n", value.value);
					break;
				case "STRING":
					fd.printf("\t'%s'\n", value.value);
					break;
				case "TOKEN":
					fd.printf("\t'%s'\n", value.content);
					break;
				default:
					throw("Unmanaged type " + value.type);
			}
		}
	};

	let manageArrays = function(ary) {
		for(var idx in  ary) {
			let value =  ary[idx];
			fd.printf("\t[");
			for(var idx2 in value) {
				fd.printf(" %s", value[idx2]);
			}
			fd.printf(" ]\n");
		}
	};

	if( json.extras && json.extras.length ) {
		fd.printf("%%extras\n");
		manageTuples(json.extras);
		fd.printf("\t;\n");
	}

	if( json.conflicts && json.conflicts.length ) {
		fd.printf("%%conflicts\n");
		manageArrays(json.conflicts);
		fd.printf("\t;\n");
	}

	if( json.externals && json.externals.length ) {
		fd.printf("%%externals\n");
		manageTuples(json.externals);
		fd.printf("\t;\n");
	}

	if( json.precedences  && json.precedences.length) {
		fd.printf("%%precedences\n");
		for(var idx in  json.precedences) {
			let value =  json.precedences[idx];
			fd.printf("\t[");
			for(var idx2 in value) {
				let pval = value[idx2];
				switch(pval.type) {
					case "SYMBOL":
						fd.printf(" %s", pval.name);
						break;
					case "STRING":
						fd.printf(" '%s'", pval.value);
						break;
					default:
						throw("Unmanaged type " + pval.type);
				}
			}
			fd.printf(" ]\n");
		}
		fd.printf("\t;\n");
	}

	if( json.inline  && json.inline.length ) {
		fd.printf("%%inline\n");
		for(var idx in  json.inline) {
			let value =  json.inline[idx];
			fd.printf("\t%s\n", value);
		}
		fd.printf("\t;\n");
	}

	if( json.supertypes  && json.supertypes.length ) {
		fd.printf("%%supertypes\n");
		for(var idx in  json.supertypes) {
			let value =  json.supertypes[idx];
			fd.printf("\t%s\n", value);
		}
		fd.printf("\t;\n");
	}

	let manageAlias = function (rules, alias_list) {
		for(var idx in  rules) {
			let rhs = rules[idx2];
			for(var idx2 in  rhs) {

			}
		}
	};
	let alias_list = {};

	let str_list = [];

	let manageRule = function (name, rule, depth) {
		//print(name, rule.type); //, typeof rule);
		switch(rule.type)
		{
			case "ALIAS":
				//fd.printf(" ( ");
				//manageRule(rule.type, rule.content, depth+1);
				//fd.printf(" )@%s ", rule.value);
				fd.printf("%s", rule.value);
				//alias_list[rule.value] = ;
			break;
			case "BLANK":
				print(rule.type);
			break;
			case "CHOICE": {
				let members = rule.members;
				let mcount = members.length;
				let isOptional = mcount == 2 && members[1].type == "BLANK";
				if(isOptional) {
					//if(mcount > 2 && depth) fd.printf(" (");
					manageRule(rule.type, members[0], depth+1);
					//if(mcount > 2 && depth) fd.printf(" )");
					fd.printf("? ");
				}
				else
				{
					//fd.printf("=== %d : %d : %d ===", mcount, depth, mcount > 1 && depth);
					if(mcount > 1 && depth) fd.printf(" (");
					for(var idx in members) {
						if(idx > 0) fd.printf(" | ");
						manageRule(rule.type, members[idx], depth+1);
					}
					if(mcount > 1 && depth) fd.printf(" ) ");
				}
			}
			break;
			case "FIELD":
				//print(rule.type, rule.name);
				fd.printf(" ( ");
				manageRule(rule.type, rule.content, depth+1);
				fd.printf(" ) ");
			break;
			case "IMMEDIATE_TOKEN":
				fd.printf(" ( ");
				manageRule(rule.type, rule.content, depth+1);
				fd.printf(" ) ");
			break;
			case "PATTERN": {
				fd.printf(" /%s/", rule.value);
			}
			break;
			case "PREC":
			case "PREC_DYNAMIC":
			case "PREC_LEFT":
			case "PREC_RIGHT":
				fd.printf("  ( ");
				manageRule(rule.type, rule.content, depth+1);
				fd.printf(" ) ");
			break;
			case "REPEAT":
				//if(depth) fd.printf(" ( ");
				manageRule(rule.type, rule.content, depth+1);
				//if(depth) fd.printf(" )");
				fd.printf("* ");
			break;
			case "REPEAT1":
				//if(depth) fd.printf(" (");
				manageRule(rule.type, rule.content, depth+1);
				//if(depth) fd.printf(" )");
				fd.printf("+ ");
			break;
			case "SEQ": {
				let members = rule.members;
				let mcount = members.length;
				//fd.printf("=== %d : %d ===", mcount, depth);
				if(mcount > 1 && depth) fd.printf(" ( ");
				for(var idx in members) {
					manageRule(rule.type, members[idx], depth+1);
				}
				if(mcount > 1 && depth) fd.printf(" ) ");
			}
			break;

			case "STRING": {
				let value = rule.value;
				//print(rule.type, value);
				switch(value) {
					case "\0": fd.printf(" '\\0' "); break;
					case "\b": fd.printf(" '\\b' "); break;
					case "\f": fd.printf(" '\\f' "); break;
					case "\n": fd.printf(" '\\n' "); break;
					case "\r": fd.printf(" '\\r' "); break;
					case "\t": fd.printf(" '\\t' "); break;
					case "\v": fd.printf(" '\\v' "); break;
					case "\\": fd.printf(" '\\\\' "); break;
					case "'": fd.printf(" \"'\" "); break;
					case "\"": fd.printf(" '\"' "); break;
					default:
						//value = value.replace(/\\/g, "\\\\");
						value = value.replace(/\t/g, "\\t"); //order matter
						if(value.indexOf("'") >= 0) fd.printf(" \"%s\" ", value);
						else fd.printf(" \"%s\" ", value);
				}
			}
			break;

			case "SYMBOL":
				//print(rule.type, rule.name);
				fd.printf(" %s", rule.name);
			break;

			case "TOKEN":
				fd.printf(" ( ");
				manageRule(rule.type, rule.content, depth+1);
				fd.printf(" ) ");
			break;

			default:
				throw("Unknown rule type: " + rule.type);
		}
	}

	let rules = json.rules;
	for(var idx in rules) {
		let rule = rules[idx];
		//print(rule);
		if(rule.type == "xTOKEN") {
			let str = idx + " : " + + " .";
		}
		else {
			fd.printf("%s : ", idx);
			manageRule(idx, rule, 0);
		}
		fd.printf(" ;\n");
	}
	fd.close();
}

for(let idx in fname_list)
{
	let fname = fname_list[idx];
	print(fname);
	parseJsonGrammar(fname);
}

@soasme soasme added the enhancement New feature or request label Dec 14, 2021
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Projects
None yet
Development

No branches or pull requests

2 participants