Parser.php

<?php

namespace Tlf\Lexer2;

/**
 * Parses Directive code (our custom parser-definition language) into an AST.
 *
 * The source language is indentation based:
 *
 *     directive:
 *         instruction_set:
 *             command arg1 arg2 # comment
 *             command:
 *                 key=value
 *                 =value        # zero-indexed entry
 */
class Parser {

    /**
     * Get an AST from parser code.
     *
     * General parser strategy & rules:
     * - `[a-zA-Z]:` in global starts a directive block. The next directive block declaration
     *   at the same indentation-level starts a new directive block.
     * - `[a-zA-Z]:` in a directive block starts an instruction set. The next instruction set
     *   declaration at the same indentation-level starts a new instruction set.
     * - Anything at the next indentation-level is a command. Notes:
     *   - namespace:directive_name will activate a directive
     *   - :directive_name is a shorthand for the current namespace, and activates a directive
     *   - [a-zA-Z] ...arguments is an stdlib function call
     *   - [a-zA-Z].[a-zA-Z] ...arguments calls a function on an object in the program
     *   - !argument... denotes an argument that should be processed as a command
     *   - # starts a comment that continues until the end of the line
     *   - `command ...arguments:` the end-of-line colon means a key=>value array follows.
     *     The key=>value array must be at the next level of indentation.
     *   - $1, $2, $3 ... reference previous matches, typically capture-groups from a previous
     *     buffer.match call in the current scope
     *   - Reserved instruction sets (is, if_started, if_unstarted) have specific uses & behaviors.
     *   - `namespace:directive.instruction_set` calls un-reserved instruction sets as functions, basically.
     *   - TODO: var_name=Some Value
     *
     * @param $parser_code string Parser code (our custom language)
     *
     * @return array AST of the input $parser_code, shaped like
     *      array<string directive_name, array<string instruction_set_name, array<int index, mixed command>>>
     *      where each command is whatever Command::toArray() returns.
     *
     * @throws \Throwable rethrows anything thrown by Command::toArray(), after echoing the offending line
     */
    public function parse_directive_code(string $parser_code): array {
        $structured_code = $this->get_structured_code($parser_code);

        // Walk the directive => instruction set => command structure and replace every
        // unparsed command line with its parsed representation.
        $final = [];
        foreach ($structured_code as $directive_name => $instruction_sets){
            foreach ($instruction_sets as $instruction_set_name => $command_list){
                foreach ($command_list as $index => $command_info){
                    $line = $command_info['line'];

                    $command = $this->parse_command_string($line, $command_info['array_arg'] ?? null);

                    try {
                        $command_as_array = $command->toArray();
                    } catch (\Throwable $e){
                        // Point at the offending source line before propagating the original error.
                        echo "\n\nINTERNAL ERROR: Cannot convert command '$line' to array.\n\n";
                        throw $e;
                    }
                    $final[$directive_name][$instruction_set_name][$index] = $command_as_array;
                }
            }
        }

        return $final;
    }


    /**
     * Parse an array of key=>value declaration strings, such as:
     * [
     *    0=>'name=$1',
     *    1=>'type=Method',
     * ]
     *
     * A declaration with an empty key (like '=value') is appended with the next integer index.
     *
     * NOTE(review): only the first argument parsed from the value is kept, so a plain
     * multi-word value like 'name=Some Value' yields 'Some'. A '!command ...' value is a
     * single argument and is kept whole. Confirm whether multi-word values should be supported.
     *
     * @param $array_entry_declarations array<int index, string $declaration> where a declaration is like 'name=$1' or 'type=Method'
     *
     * @return array<string|int index_or_key, mixed arg> array of arg values, as produced by get_command_arguments().
     *      An empty value (like 'name=') yields null.
     *
     * @throws \InvalidArgumentException if a declaration does not contain '='
     */
    public function parse_array_entry_declarations(array $array_entry_declarations): array {

        $ret_args = [];
        foreach ($array_entry_declarations as $declaration){
            $parts = explode("=", $declaration, 2);
            if (count($parts) < 2){
                throw new \InvalidArgumentException(
                    "Array entry declaration '$declaration' must be in the form 'key=value' or '=value'."
                );
            }
            [$key, $value_string] = $parts;

            $arg_list = $this->get_command_arguments($value_string, null);
            // an empty value parses to zero arguments
            $arg = $arg_list[0] ?? null;

            if ($key===''){
                $ret_args[] = $arg;
            } else {
                $ret_args[$key] = $arg;
            }
        }

        return $ret_args;
    }

    /**
     * Create a Command from a string of command code, like "buffer.match /regex/".
     *
     * The first space-delimited word is the declared command, and can be:
     * - stdlib_method     # Note: You don't write stdlib_. Just the method name that is available in the stdlib.
     * - object.method
     * - object.property.method
     * - :directive
     * - namespace:directive
     * - localvar=
     *
     * Design notes:
     * - The command name may be a shorthand, and is converted into the long-hand form for the output command.
     * - An argument that starts with `!` is a command that has to be parsed. There is no termination
     *   char, so a `!` argument consumes the rest of the line and should be the last argument.
     * - If arg1 is an array, a second array can only be passed by first declaring it as a variable.
     *
     * If the declared command matches none of the known forms, the Command's object & method are left untouched.
     *
     * @param $command_string string the text of a command's code, like "buffer.match /regex/"
     * @param $array_entry_declarations ?array<int index, string $declaration> where a declaration is like 'name=$1' or 'type=Method'
     *
     * @return \Tlf\Lexer2\Command a Command w/ declaration, declared_command, object, method, and args set
     */
    public function parse_command_string(string $command_string, ?array $array_entry_declarations): \Tlf\Lexer2\Command {
        $command = new \Tlf\Lexer2\Command();
        $command->declaration = $command_string;
        $command->array_entry_declarations = $array_entry_declarations;

        // Split "declared_command rest of the arguments" on the first space.
        $line = trim($command_string);
        $line_parts = explode(" ", $line, 2);
        $declared_command = $line_parts[0];
        $args_list_string = $line_parts[1] ?? '';
        $command->declared_command = "'$declared_command'";

        $stdlib_reg = '/^([a-zA-Z_]+)$/';
        $objmethod_reg = '/^([a-zA-Z_]+)\\.([a-zA-Z_]+)$/';
        $nested_object_reg = '/^((?:(?:[a-zA-Z_]+)\\.){2})([a-zA-Z_]+)$/';
        $directive_activation_reg = '/^([a-zA-Z_]+)?\:([a-zA-Z_]+)/';
        $declare_localvar_reg = '/^([a-zA-Z_]+)=(.*)$/s';


        if (preg_match($stdlib_reg,$declared_command,$matches)){
            $command->object = 'StdLib';
            $command->method = $matches[1];
        } else if (preg_match($declare_localvar_reg, $declared_command, $matches)){
            $command->object = 'StdLib';
            $command->method = 'set_arg';
            // NOTE(review): the trailing space after the variable name is kept as-is; confirm whether set_arg relies on it.
            $command->args[] = $matches[1].' ';
            // `name=value` (no space after `=`) carries its value inside the declared command.
            // Hand it to the argument parser so it is not silently dropped.
            if ($matches[2]!==''){
                $args_list_string = $args_list_string==='' ? $matches[2] : $matches[2].' '.$args_list_string;
            }
        } else if (preg_match($objmethod_reg, $declared_command, $matches)){
            $command->object = $matches[1];
            $command->method = $matches[2];
        } else if (preg_match($nested_object_reg, $declared_command, $matches)){
            $command->object = substr($matches[1],0,-1); // remove the trailing dot
            $command->method = $matches[2];
        } else if (preg_match($directive_activation_reg, $declared_command, $matches)) {
            $command->object = 'StdLib';
            $command->method = 'activateDirective';
            // first arg is the namespace, or null for the `:directive` shorthand
            $command->args[] = $matches[1]=='' ? null : $matches[1];
            $command->args[] = $matches[2];
        }

        $command->args = array_merge($command->args, $this->get_command_arguments($args_list_string, $array_entry_declarations));
        return $command;
    }


    /**
     * Get an array of arguments for a Command, from a declared list of arguments.
     *
     * Arguments are separated by single spaces:
     * - `!command ...` is parsed as a sub-command and consumes the rest of the line.
     *   Any array entry declarations are attached to the sub-command as its last argument.
     * - `$name` becomes a StdLib get_arg sub-command for `name`.
     * - anything else is kept as a plain string.
     * When array entry declarations were not consumed by a `!` sub-command, the parsed
     * key=>value array is appended as the final argument.
     *
     * @param $args_list_string string arguments. For command 'buffer.match /regex/', this will be string '/regex/'
     * @param $array_entry_declarations ?array<int index, string $arg_declaration> declarations of arguments, like "name=$1" & "type=Person"
     *
     * @return array<int index, mixed value> array of arguments. Arguments are strings, arrays of parsed
     *      array entries, or sub-commands converted with Command::toArray().
     */
    public function get_command_arguments(string $args_list_string, ?array $array_entry_declarations): array {
        if (trim($args_list_string)=='')return [];
        $args = explode(" ", $args_list_string);

        $ret_args = [];
        foreach ($args as $index=>$arg){
            $prefix = substr($arg,0,1);

            if ($prefix=='!'){
                // The sub-command is this argument (minus the `!`) plus everything after it.
                $sub_command_string = substr($arg,1)." ".implode(" ", array_slice($args,$index+1));
                $sub_command = $this->parse_command_string($sub_command_string, null);
                if ($array_entry_declarations!==null){
                    $sub_command->args[] = $this->parse_array_entry_declarations($array_entry_declarations);
                }
                $ret_args[] = $sub_command->toArray();
                // the declarations now belong to the sub-command, so don't append them again below
                $array_entry_declarations = null;
                break;
            } else if ($prefix=='$'){
                $sub_command = new \Tlf\Lexer2\Command();
                $sub_command->declaration = $arg;
                $sub_command->object = 'StdLib';
                $sub_command->method = 'get_arg';
                $sub_command->args = [substr($arg,1)];

                $ret_args[] = $sub_command->toArray();
            } else {
                $ret_args[] = $arg;
            }
        }

        if ($array_entry_declarations!==null){
            $ret_args[] = $this->parse_array_entry_declarations($array_entry_declarations);
        }

        return $ret_args;
    }

    /**
     * Parse the code into a structure, but don't parse commands themselves.
     *
     * Lines whose first non-whitespace character is `#` are skipped, and a `#` preceded by
     * whitespace starts a comment that is removed from the rest of the line.
     *
     * The base indentation is taken from the first line that is neither blank nor a comment.
     * The indentation of instruction sets, commands, and array entries is discovered from the
     * first such line encountered (up to 11 columns deeper than its parent level) and then
     * enforced for the lines that follow. Lines that fit nowhere are ignored, including
     * instruction sets before any directive and commands before any instruction set.
     *
     * @param $parser_code string
     *
     * @return array<string directive_name, array instruction_sets> structured array of code & arguments (commands and args remain as unparsed text)
     *      - instruction_sets is array<string instruction_set_name, array unparsed_commands>
     *      - unparsed_commands is array<int index, array command_info>
     *      - command_info has key 'line' = the command's line (without the array-opening colon)
     *        & optional key 'array_arg' = array<int index, string $arg_string>
     */
    public function get_structured_code(string $parser_code): array {
        $directives = [];
        $lines = explode("\n", $parser_code);

        $line_comment_regex = '/^\s*\#/';
        $end_of_line_comment_regex = '/\s\#.*$/';

        // The base indent comes from the first meaningful line, so a leading blank line
        // or comment does not break indented code.
        $initial_indent = 0;
        foreach ($lines as $candidate){
            if (trim($candidate)==='' || preg_match($line_comment_regex, $candidate)){
                continue;
            }
            $initial_indent = strspn($candidate, " \t");
            break;
        }
        $directive_regex = sprintf('/^\s{%d,%d}([a-zA-Z_]+)\:/', $initial_indent, $initial_indent);

        // Where we currently are in the structure. null means "not inside one yet".
        $directive_name = null;
        $instruction_set_name = null;
        $command_index = null;

        // 0 means "not discovered yet"
        $discovered_indent = 0;
        $discovered_cmd_indent = 0;
        $discovered_array_arg_indent = 0;

        foreach ($lines as $line){
            if (preg_match($line_comment_regex, $line)){
                continue;
            }
            // Drop end-of-line comments, trailing whitespace & any "\r" from CRLF input,
            // so a trailing `:` is reliably the last character of the line.
            $line = rtrim((string)preg_replace($end_of_line_comment_regex, "", $line));

            // Each level is allowed 1 to 11 columns past its parent until its real indent is discovered.
            $minimum_indent = $discovered_indent!=0 ? $discovered_indent : $initial_indent + 1;
            $maximum_indent = $discovered_indent!=0 ? $discovered_indent : $initial_indent + 11;
            $instruction_set_regex = sprintf('/^(\s{%d,%d})([a-zA-Z_]+)\:/', $minimum_indent, $maximum_indent);

            $cmd_indent_min = $discovered_cmd_indent!=0 ? $discovered_cmd_indent : $maximum_indent + 1;
            $cmd_indent_max = $discovered_cmd_indent!=0 ? $discovered_cmd_indent : $maximum_indent + 11;
            $command_regex = sprintf('/^(\s{%d,%d})([^\s].*)$/', $cmd_indent_min, $cmd_indent_max);

            $arr_indent_min = $discovered_array_arg_indent!=0 ? $discovered_array_arg_indent : $cmd_indent_max + 1;
            $arr_indent_max = $discovered_array_arg_indent!=0 ? $discovered_array_arg_indent : $cmd_indent_max + 11;
            $array_arg_regex = sprintf('/^(\s{%d,%d})([^\s].*)$/', $arr_indent_min, $arr_indent_max);

            if (preg_match($directive_regex, $line, $matches)){
                $directive_name = $matches[1];
                $directives[$directive_name] = [];
                $instruction_set_name = null;
                $command_index = null;
                $discovered_indent = 0;
            } else if (preg_match($instruction_set_regex, $line, $matches)){
                if ($directive_name===null){
                    // instruction set outside of any directive
                    continue;
                }
                $discovered_indent = strlen($matches[1]);
                $instruction_set_name = $matches[2];
                $directives[$directive_name][$instruction_set_name] = [];
                $command_index = null;
                $discovered_cmd_indent = 0;
            } else if (preg_match($command_regex, $line, $matches)){
                if ($instruction_set_name===null){
                    // command outside of any instruction set
                    continue;
                }
                $command_index = count($directives[$directive_name][$instruction_set_name]);
                $directives[$directive_name][$instruction_set_name][$command_index] = ['line'=>$matches[2]];
                $discovered_cmd_indent = strlen($matches[1]);
                $discovered_array_arg_indent = 0;
            } else if (preg_match($array_arg_regex, $line, $matches)){
                if ($command_index===null){
                    // array entry without a command to attach to
                    continue;
                }
                $command_info = $directives[$directive_name][$instruction_set_name][$command_index];
                if (!isset($command_info['array_arg'])){
                    if (substr($command_info['line'],-1)!=':'){
                        // the command did not open an array with a trailing colon
                        continue;
                    }
                    // First entry: strip the array-opening colon exactly once.
                    $command_info['line'] = substr($command_info['line'],0,-1);
                    $command_info['array_arg'] = [];
                }
                $command_info['array_arg'][] = $matches[2];
                $directives[$directive_name][$instruction_set_name][$command_index] = $command_info;
                $discovered_array_arg_indent = strlen($matches[1]);
            }
        }
        return $directives;
    }

}