clean-transcription.php

<?php
/**
 * Create a clean, human-readable copy of a `.tsv` transcription file
 *
 * @arg1 - `.tsv` file to parse and create clean output from
 * @return void
 * @output `clean-transcription.txt`, written to the transcription directory (the directory of the input `.tsv`, alongside the transcription files created by whisper)
 */


/**
 * Format a segment start offset as a short, human-readable timestamp.
 *
 * Offsets below one minute keep their fractional part (e.g. `2.5s`).
 * Offsets of one minute or more are rounded to the nearest whole second
 * *before* being split into minutes and seconds, so a value such as 119.7
 * becomes `2m0s` rather than the invalid `1m60s`.
 *
 * @param float $seconds start offset in seconds
 * @return string timestamp such as `12.3s` or `4m7s`
 */
function formatSegmentStart(float $seconds): string
{
    if ($seconds < 60) {
        return $seconds.'s';
    }

    $wholeSeconds = (int) round($seconds);

    return intdiv($wholeSeconds, 60).'m'.($wholeSeconds % 60).'s';
}

/**
 * Read a `.tsv` transcription and write `clean-transcription.txt` next to it.
 *
 * Each data row is expected to hold `start<TAB>end<TAB>text`, with start/end
 * in milliseconds. The first line of the file is treated as a header and
 * skipped. Every remaining well-formed row becomes one `<start>: <text>` line.
 * Problems are reported on standard output and the function returns without
 * writing anything further.
 *
 * @param string|null $input path to the `.tsv` file, or null when no argument was given
 * @return void
 */
function writeCleanTranscription($input)
{
    if ($input === null || $input === '') {
        echo "Usage: php clean-transcription.php <file.tsv>\n";
        return;
    }
    if (!is_file($input)) {
        echo "Input file does not exist.\nPath: $input\n";
        return;
    }
    if (substr($input, -4) !== '.tsv') {
        echo "Input file is not a .tsv file.\nPath: $input\n";
        return;
    }

    $fh = fopen($input, 'r');
    if ($fh === false) {
        echo "Input file could not be opened.\nPath: $input\n";
        return;
    }

    $output = dirname($input).'/clean-transcription.txt';
    $out = fopen($output, 'w');
    if ($out === false) {
        fclose($fh);
        echo "Output file could not be opened.\nPath: $output\n";
        return;
    }

    // Skip the header line.
    fgets($fh);

    // Compare against false explicitly: a final line of "0" is falsy but valid.
    while (($line = fgets($fh)) !== false) {
        $fields = explode("\t", $line);
        if (count($fields) < 3) {
            // Blank or malformed row: nothing meaningful to emit.
            continue;
        }

        // The first column is the start time in milliseconds.
        $start = formatSegmentStart((float) $fields[0] / 1000);
        $text = trim($fields[2]);

        fwrite($out, "{$start}: $text\n");
    }

    fclose($fh);
    fclose($out);
}

// `??` also covers the case where $argv itself is not defined.
writeCleanTranscription($argv[1] ?? null);