Converter.php

<?php

namespace Tlf;

class DataConverter {

    protected array $sources = [];

    public function __construct(){}

    public function addSource($source_dir){
        if (!is_dir($source_dir)){
            throw new \Exception("Directory '$source_dir' does not exist or is not a directory");
        }
        $this->sources[] = $source_dir;
    }

    public function convert($options=[]){
        $options = array_merge($this->defaults(), $options);
        extract($options);
        foreach ($this->sources as $dir){
            $input = $dir.'/source.ods'; //later make this accept any source name
            if ($gen_csv)$this->convert_to_csv($dir.'/source.ods');
            if ($gen_json)$this->convert_to_json($dir.'/out.csv');
            if ($gen_sql)$this->convert_to_sql($dir.'/out.json');
            if ($gen_sqlite)$this->convert_to_sqlite($dir.'/out.sql');
        }
    }


    /**
     * Convert a .ods file to .csv using `libreoffice` system command
     */
    public function convert_to_csv(string $source_ods){
        // old command that did not produce valid utf8
        // $cmd = "libreoffice --headless --convert-to csv \"$odsDir\"/* --outdir \"$csvDir\"";

        //--infilter is to force utf8 & comes from https://unix.stackexchange.com/questions/259361/specify-encoding-with-libreoffice-convert-to-csv
        // I have no idea how it works
        $out_dir = dirname($source_ods).'/out/';
        $cmd = "libreoffice --headless --convert-to csv --infilter=CSV:44,34,76,1 \"$source_ods\"/* --outdir \"$out_dir\"";

        //there is an alternate solution requiring an additional step for file format conversion available at: https://superuser.com/questions/199799/vim-shows-strange-characters-91-92#
        system($cmd);

        $this->clean_csv($out_dir.'/source.csv', dirname($source_ods).'/out.csv');
    }


    public function clean_csv($dirtyCsvFile, $write_to){
        
        $csv = \League\Csv\Reader::createFromPath($dirtyCsvFile, 'r');
        // row 9 has db column names
        // row 10 has spreadsheet column names (human friendly)
        // row 11 (offset 10) has first row of data
        $csv->setHeaderOffset(8); 
        $header = $csv->getHeader(); //returns the CSV header record

        $cols = [];
        foreach ($header as $index=>$colName){
            if (trim($colName)==''||substr($colName,0,2)=='--')break;
            $cols[] = $colName;
        }

        $f = fopen($write_to, 'w');
        if (!$f){
            throw new \Exception("Could not open ".$write_to);
        }
        fputcsv($f, $cols);

        $has_written = false;
        foreach ($csv->getRecords($cols) as $offset => $record){
            if ($offset<10)continue;
            $record = $this->clean_row($cols,$record);
            if ($record == null)continue;
            fputcsv($f, $record);
            $has_written = true;
        }
        fclose($f);
    }

    /**
     * CSV file must be clean & contain headers at row 0 & first row of data at row 1
     * @param $source_csv the path to the source csv file
     * @param $out_file path to the output file or null to use out.json in $source_csv's directory
     */
    public function convert_to_json(string $source_csv, $out_file=null){

        $out_file = $out_file ?? dirname($source_csv).'/out.json';

        $csv = \League\Csv\Reader::createFromPath($source_csv, 'r');
        $csv->setHeaderOffset(0); 
        $header = $csv->getHeader(); //returns the CSV header record

        $f = fopen($out_file, 'w');
        if (!$f){
            throw new \Exception("Could not open ".$jsonPath);
        }
        fwrite($f, "[\n");
        $has_written = false;
        foreach ($csv->getRecords($header) as $offset => $record){
            if ($has_written) fwrite($f,','."\n");
            fwrite($f,json_encode($record));
            $has_written = true;
        }
        fwrite($f, "\n]");
        fclose($f);
    }
    


    /**
     *
     * @see clean_value()
     * @return an array with cleaned-up data
     */
    public function clean_row($cols, $row){
        if (trim($row[$cols[0]])==''
            ||trim($row[$cols[1]])=='')return null;
        $out = [];
        foreach ($row as $key=>$value){
            $value = $this->clean_value($key,$value);
            $out[$key] = $value;
        }

        return $out;
    }

    /**
     * - Rounds percents to 2 decimal places & casts to float
     * - converts 'na', 'n/a', and empty string to null
     *
     * @param $key the key for the column
     * @param $value the value to modify
     *
     * @return a cleaned up version of the value (sometimes null)
     */
    public function clean_value($key, $value){
        if (substr($value,-1)=='%'){
            $value = (float)substr($value,0,-1);
            $value = round($value,2);
            // 2 decimal places & remove the %
        } else if ($value=='na'||$value=='n/a'){
            $value = null;
        } else if ($value === ''){
            $value = null;
        }

        return $value;
    }


    public function defaults(){

        $defaults = [ 
            // bugfix: Fileformat encoding leaves some characters incompatible with utf8, so apply a fix with libreoffice cli
            'use_filetype_conversion'=>true,

            // WARNING: OVERWRITES FILES if true
            // none of the files should be edited, generally. They're meant to be built from .ods spreadsheets
            // Good to set them false if testing some kind of error
            //
            'gen_csv'=>true,
            'gen_json'=>true,
            'gen_sql'=>true,

            'gen_sqlite'=>true,

        ];

        return $defaults;
    }
}