<?php
namespace Tlf\Lexer2;
/**
* Parses directive code into an AST.
*/
class Parser {

    /**
     * Get an AST from parser code.
     *
     * General parser strategy & rules of the directive language:
     * - `[a-zA-Z]:` in global scope starts a directive block.
     *   The next directive block declaration at the same indentation-level starts a new directive block.
     * - `[a-zA-Z]:` in a directive block starts an instruction set.
     *   The next instruction set declaration at the same indentation-level starts a new instruction set.
     * - Anything at the next indentation-level is a command. Notes:
     *   - `namespace:directive_name` will activate a directive
     *   - `:directive_name` is a shorthand for the current namespace, and activates a directive
     *   - `[a-zA-Z] ...arguments` is an stdlib function call
     *   - `[a-zA-Z].[a-zA-Z] ...arguments` calls a function on an object in the program
     *   - `!argument...` denotes an argument that should be processed as a command
     *   - `#` starts a comment that continues until the end of the line
     *   - `command ...arguments:` the end-of-line colon means a key=>value array follows
     *     (the array is a single argument). The key=>value array must be at the next level of indentation.
     *   - `$1, $2, $3 ...` reference previous matches, typically capture-groups from a previous
     *     buffer.match call in the current scope
     *   - Reserved instruction sets (is, if_started, if_unstarted) have specific uses & behaviors.
     *   - `namespace:directive.instruction_set` calls un-reserved instruction sets as functions, basically.
     *   - TODO: var_name=Some Value
     *
     * @param $parser_code string Parser code (our custom language)
     *
     * @return array AST of the input $parser_code, shaped as
     *      array<string directive_name, array<string instruction_set_name, array<int index, array command>>>.
     *      Directives and instruction sets that contain no commands do not appear in the result.
     *
     * @throws \Throwable whatever Command::toArray() throws is re-thrown after an error notice is printed.
     */
    public function parse_directive_code(string $parser_code): array {
        $structured_code = $this->get_structured_code($parser_code);

        // Loop over the structure and:
        // - parse commands
        // - parse arguments
        // - build a final AST of the parser code
        $final = [];
        foreach ($structured_code as $directive_name => $instruction_sets) {
            foreach ($instruction_sets as $instruction_set_name => $command_list) {
                foreach ($command_list as $index => $command_info) {
                    $line = $command_info['line'];
                    $command = $this->parse_command_string($line, $command_info['array_arg'] ?? null);
                    try {
                        $command_as_array = $command->toArray();
                    } catch (\Throwable $e) {
                        // Name the offending command before propagating, so the author can find it.
                        echo "\n\nINTERNAL ERROR: Cannot convert command '$line' to array.\n\n";
                        throw $e;
                    }
                    $final[$directive_name][$instruction_set_name][$index] = $command_as_array;
                }
            }
        }

        return $final;
    }

    /**
     * Parse an array of key=>value declaration strings, such as:
     * [
     *     0=>'name=$1',
     *     1=>'type=Method',
     * ]
     *
     * A declaration with an empty key (`=value`) is appended with the next integer index.
     * Only the first argument of the value is used; an empty value (`name=`) yields null.
     *
     * @param $array_entry_declarations array<int index, string $declaration> where a declaration is like 'name=$1' or 'type=Method'
     *
     * @return array<string|int index_or_key, string|array|null arg> array of arg values. An arg value is a
     *      plain string, or the array form of a Command (for `!command` and `$n` values).
     *
     * @throws \InvalidArgumentException if a declaration contains no '='.
     */
    public function parse_array_entry_declarations(array $array_entry_declarations): array {
        $ret_args = [];
        foreach ($array_entry_declarations as $declaration) {
            $parts = explode('=', $declaration, 2);
            if (count($parts) < 2) {
                // Without this guard the missing value surfaces as an opaque TypeError further down.
                throw new \InvalidArgumentException(
                    "Array entry '$declaration' must be declared as 'key=value' or '=value'."
                );
            }
            [$key, $value_string] = $parts;

            $arg_list = $this->get_command_arguments($value_string, null);
            $arg = $arg_list[0] ?? null;

            if ($key === '') {
                $ret_args[] = $arg;
            } else {
                $ret_args[$key] = $arg;
            }
        }

        return $ret_args;
    }

    /**
     * Create a Command from a string of command code, like "buffer.match /regex/".
     *
     * What does a command consist of?
     * - An object to call
     * - A method on that object
     * - A list of arguments
     *
     * Notes:
     * - The first non-breaking string is the command name
     * - The command name may be a shorthand, and need to be converted into the long-hand form for the output command.
     *   Ex: `:pet_name` becomes a StdLib `activateDirective` call with the directive name as an argument.
     * - An argument `[a-zA-Z]=` is a shorthand COMMAND that declares a variable in the current scope
     * - An argument that starts with `!` is a command that has to be parsed.
     *   There is no termination character for a `!` argument: it consumes the rest of the line,
     *   so `!` arguments should be the last argument.
     * - If arg1 is an array, can there be an arg2? YES, but you'd have to declare a variable as an
     *   array & then pass the variable.
     *
     * @param $command_string string the text of a command's code, like "buffer.match /regex/"
     * @param $array_entry_declarations ?array<int index, string $declaration> where a declaration is like 'name=$1' or 'type=Method'
     *
     * @return \Tlf\Lexer2\Command a Command with declaration, declared_command, array_entry_declarations,
     *      args and (when the command name is recognized) object and method filled in.
     */
    public function parse_command_string(string $command_string, ?array $array_entry_declarations): \Tlf\Lexer2\Command {
        $command = new \Tlf\Lexer2\Command();
        $command->declaration = $command_string;
        $command->array_entry_declarations = $array_entry_declarations;

        // The command name is everything up to the first space; the remainder is the argument list.
        $line = trim($command_string);
        $first_space_pos = strpos($line, " ");
        if ($first_space_pos === false) {
            $declared_command = $line;
            $args_list_string = '';
        } else {
            $declared_command = substr($line, 0, $first_space_pos);
            $args_list_string = substr($line, $first_space_pos + 1);
        }
        $command->declared_command = "'$declared_command'";

        // declared command can be:
        // - stdlib_method # Note: You don't write stdlib_. Just the method name that is available in the stdlib.
        // - object.method
        // - object.property.method # Can we go deeper? object.property.property.method ?
        // - (MAYBE) object.!method.submethod # Where object.!method returns an object & submethod is a method on that returned object.
        // - :directive
        // - namespace:directive
        // - localvar=
        $stdlib_reg = '/^([a-zA-Z_]+)$/';
        $objmethod_reg = '/^([a-zA-Z_]+)\\.([a-zA-Z_]+)$/';
        $nested_object_reg = '/^((?:(?:[a-zA-Z_]+)\\.){2})([a-zA-Z_]+)$/';
        $directive_activation_reg = '/^([a-zA-Z_]+)?\:([a-zA-Z_]+)/';
        $declare_localvar_reg = '/^([a-zA-Z_]+)=/';

        // If none of the patterns match, object and method are left as the Command class initialised them.
        if (preg_match($stdlib_reg, $declared_command, $matches)) {
            $command->object = 'StdLib';
            $command->method = $matches[1];
        } elseif (preg_match($declare_localvar_reg, $declared_command, $matches)) {
            $command->object = 'StdLib';
            $command->method = 'set_arg';
            // NOTE(review): the variable name is passed with a trailing space, and any text that follows
            // '=' inside the first word (`name=Some Value` => 'Some') is not forwarded. Confirm against
            // the set_arg implementation before changing; the feature is marked TODO in the language notes.
            $command->args[] = $matches[1].' ';
        } elseif (preg_match($objmethod_reg, $declared_command, $matches)) {
            $command->object = $matches[1];
            $command->method = $matches[2];
        } elseif (preg_match($nested_object_reg, $declared_command, $matches)) {
            $command->object = substr($matches[1], 0, -1); // remove the trailing dot
            $command->method = $matches[2];
        } elseif (preg_match($directive_activation_reg, $declared_command, $matches)) {
            $command->object = 'StdLib';
            $command->method = 'activateDirective';
            // `:directive` has no namespace: pass null so the current namespace is used.
            $command->args[] = $matches[1] == '' ? null : $matches[1];
            $command->args[] = $matches[2];
        }

        $command->args = array_merge(
            $command->args,
            $this->get_command_arguments($args_list_string, $array_entry_declarations)
        );

        return $command;
    }

    /**
     * Get an array of arguments for a Command, from a declared list of arguments.
     *
     * Arguments are separated by one or more spaces. Each argument is one of:
     * - `!command ...` a sub-command; it consumes the rest of the argument list (and the array
     *   argument, if any, which becomes the sub-command's last argument)
     * - `$name` a reference, converted into a StdLib `get_arg` command
     * - anything else is kept as a plain string
     *
     * @param $args_list_string string arguments. For command 'buffer.match /regex/', this will be string '/regex/'
     * @param $array_entry_declarations ?array<int index, string $arg_declaration> declarations of arguments, like "name=$1" & "type=Person"
     *
     * @return array<int index, mixed value> array of arguments. Arguments are strings, the array form of
     *      Commands, or (for the array argument) a key=>value array.
     */
    public function get_command_arguments(string $args_list_string, ?array $array_entry_declarations): array {
        if (trim($args_list_string) == '') {
            return [];
        }

        // Split on runs of spaces so that repeated or leading spaces do not produce empty arguments.
        $args = preg_split('/ +/', $args_list_string, -1, PREG_SPLIT_NO_EMPTY);

        $ret_args = [];
        foreach ($args as $index => $arg) {
            $sigil = $arg[0];

            if ($sigil === '!') {
                // A sub-command owns every remaining word on the line.
                $sub_command_string = substr($arg, 1)." ".implode(" ", array_slice($args, $index + 1));
                $sub_command = $this->parse_command_string($sub_command_string, null);
                if (!empty($array_entry_declarations)) {
                    $sub_command->args[] = $this->parse_array_entry_declarations($array_entry_declarations);
                }
                $ret_args[] = $sub_command->toArray();

                // The array argument (if any) was handed to the sub-command; nothing else to add.
                return $ret_args;
            }

            if ($sigil === '$') {
                $reference = new \Tlf\Lexer2\Command();
                $reference->declaration = $arg;
                $reference->object = 'StdLib';
                $reference->method = 'get_arg';
                $reference->args = [substr($arg, 1)];
                $ret_args[] = $reference->toArray();
                continue;
            }

            $ret_args[] = $arg;
        }

        if ($array_entry_declarations !== null) {
            $ret_args[] = $this->parse_array_entry_declarations($array_entry_declarations);
        }

        return $ret_args;
    }

    /**
     * Parse the code into a structure, but don't parse commands themselves.
     *
     * Expected layout (each level is indented further than its parent):
     *
     *     directive:
     *         instruction_set:
     *             command # comment
     *             command:
     *                 key=value
     *             command:
     *                 =value # zero-index
     *
     * The indentation of the first non-blank, non-comment line is the directive level. The
     * indentation of instruction sets, commands and array entries is discovered from the first
     * line found at each level (1 to 11 characters deeper than the parent) and must then stay
     * constant until the parent level starts a new block. Lines that are full-line comments,
     * blank, or that cannot be attached to an open directive / instruction set / command are skipped.
     *
     * @param $parser_code string
     *
     * @return array<string directive_name, array instruction_sets> structured array of code & arguments (commands and args remain as unparsed text)
     * @var instruction_sets array<string instruction_set_name, array unparsed_commands>
     * @var unparsed_commands array<int index, array command_info>
     * @var command_info array<string key, mixed value> w/ key(line) = the command's line (without the
     *      end-of-line colon when an array follows) & ?key(array_arg) being array<int index, string $arg_string>
     */
    public function get_structured_code(string $parser_code): array {
        $line_comment_regex = '/^\s*\#/';
        $end_of_line_comment_regex = '/\s\#.*$/';

        $directives = [];
        $lines = explode("\n", $parser_code);

        // The directive level is the indentation of the first meaningful line, so that leading
        // blank lines or comments (e.g. from a heredoc) do not skew the detection.
        $initial_indent = 0;
        foreach ($lines as $candidate) {
            if (trim($candidate) === '' || preg_match($line_comment_regex, $candidate)) {
                continue;
            }
            $initial_indent = strspn($candidate, " \t");
            break;
        }

        // What is currently open. Null means "nothing open at this level yet".
        $directive_name = null;
        $instruction_set_name = null;
        $command_index = null;

        // Indentation widths learned from the input. 0 means "not discovered yet".
        $discovered_indent = 0;
        $discovered_cmd_indent = 0;
        $discovered_array_arg_indent = 0;

        foreach ($lines as $line) {
            if (preg_match($line_comment_regex, $line)) {
                continue;
            }
            // Drop the end-of-line comment, then any whitespace left in front of it (and a CR from
            // CRLF input), so that `command:  # note` is still recognized as ending with a colon.
            $line = rtrim(preg_replace($end_of_line_comment_regex, "", $line));

            $directive_regex = sprintf('/^\s{%d}([a-zA-Z_]+)\:/', $initial_indent);

            // We listen for a directive at the initial indent or an instruction set at the next
            // indent level. Until that level is discovered, accept anything 1 to 11 deeper.
            if ($discovered_indent != 0) {
                $minimum_indent = $discovered_indent;
                $maximum_indent = $discovered_indent;
            } else {
                $minimum_indent = $initial_indent + 1;
                $maximum_indent = $initial_indent + 11;
            }
            $instruction_set_regex = sprintf('/^(\s{%d,%d})([a-zA-Z_]+)\:/', $minimum_indent, $maximum_indent);

            if ($discovered_cmd_indent != 0) {
                $cmd_indent_min = $discovered_cmd_indent;
                $cmd_indent_max = $discovered_cmd_indent;
            } else {
                $cmd_indent_min = $maximum_indent + 1;
                $cmd_indent_max = $maximum_indent + 11;
            }
            $command_regex = sprintf('/^(\s{%d,%d})([^\s].*)$/', $cmd_indent_min, $cmd_indent_max);

            if ($discovered_array_arg_indent != 0) {
                $arr_indent_min = $discovered_array_arg_indent;
                $arr_indent_max = $discovered_array_arg_indent;
            } else {
                $arr_indent_min = $cmd_indent_max + 1;
                $arr_indent_max = $cmd_indent_max + 11;
            }
            $array_arg_regex = sprintf('/^(\s{%d,%d})([^\s].*)$/', $arr_indent_min, $arr_indent_max);

            if (preg_match($directive_regex, $line, $matches)) {
                $directive_name = $matches[1];
                $directives[$directive_name] = [];
                $discovered_indent = 0;
                // A new directive closes whatever was open inside the previous one.
                $instruction_set_name = null;
                $command_index = null;
            } elseif ($directive_name !== null && preg_match($instruction_set_regex, $line, $matches)) {
                $discovered_indent = strlen($matches[1]);
                $instruction_set_name = $matches[2];
                $directives[$directive_name][$instruction_set_name] = [];
                $discovered_cmd_indent = 0;
                $command_index = null;
            } elseif ($instruction_set_name !== null && preg_match($command_regex, $line, $matches)) {
                $command_index = count($directives[$directive_name][$instruction_set_name]);
                $directives[$directive_name][$instruction_set_name][$command_index] = [
                    'line' => $matches[2],
                ];
                $discovered_cmd_indent = strlen($matches[1]);
                $discovered_array_arg_indent = 0;
            } elseif ($command_index !== null && preg_match($array_arg_regex, $line, $matches)) {
                $current = $directives[$directive_name][$instruction_set_name][$command_index];
                $array_started = isset($current['array_arg']);

                if (!$array_started) {
                    // An array may only follow a command that ends with a colon.
                    if (substr($current['line'], -1) !== ':') {
                        continue;
                    }
                    // Strip the colon once, on the first entry; later entries rely on 'array_arg'
                    // already existing rather than on the (now removed) colon.
                    $directives[$directive_name][$instruction_set_name][$command_index]['line']
                        = substr($current['line'], 0, -1);
                }

                $discovered_array_arg_indent = strlen($matches[1]);
                $directives[$directive_name][$instruction_set_name][$command_index]['array_arg'][] = $matches[2];
            }
        }

        return $directives;
    }
}