clean-transcription.php
<?php
/**
* Create a clean, human-readable copy of a `.tsv` transcription file
*
* @arg1 - `.tsv` file to parse and create clean output from
* @return void
* @output `clean-transcription.txt`, written to the same directory as the input `.tsv` file
*/
/**
 * Format a time offset for the clean transcript.
 *
 * Offsets below one minute keep their fractional part (e.g. `12.34s`).
 * Offsets of one minute or more are shown as whole minutes and whole
 * seconds (e.g. `1m2s`).
 *
 * @param float $seconds Offset from the start of the recording, in seconds.
 * @return string Human-readable timestamp.
 */
function formatTranscriptionTimestamp(float $seconds): string
{
    if ($seconds < 60) {
        return $seconds . 's';
    }

    // Round the whole offset before splitting it into minutes and seconds.
    // Rounding only the seconds part can yield "60" (119.7s -> "1m60s").
    $total = (int) round($seconds);

    return intdiv($total, 60) . 'm' . ($total % 60) . 's';
}

/**
 * Write a human-readable copy of a `.tsv` transcription file.
 *
 * The input is expected to have a header row followed by rows of
 * `start<TAB>end<TAB>text`, with start/end given in milliseconds. The result
 * is written to `clean-transcription.txt` in the same directory as the input,
 * one `<timestamp>: <text>` line per row.
 *
 * @param string $input Path to the `.tsv` file.
 * @return bool True when the output file was written, false on any error
 *              (a message describing the error is echoed).
 */
function cleanTranscription(string $input): bool
{
    if (!is_file($input)) {
        echo "Input file does not exist.\nPath: $input\n";
        return false;
    }
    if (substr($input, -4) !== '.tsv') {
        echo "Input file is not a .tsv file.\nPath: $input\n";
        return false;
    }

    // Open the input first so an unreadable input never truncates an
    // existing output file.
    $reader = fopen($input, 'r');
    if ($reader === false) {
        echo "Input file could not be opened.\nPath: $input\n";
        return false;
    }

    $output = dirname($input) . '/clean-transcription.txt';
    $writer = fopen($output, 'w');
    if ($writer === false) {
        fclose($reader);
        echo "Output file could not be opened.\nPath: $output\n";
        return false;
    }

    // Skip the header row.
    fgets($reader);

    // Compare against false explicitly: a final line of "0" without a
    // trailing newline is falsy but is still data.
    while (($line = fgets($reader)) !== false) {
        // Limit to 3 columns so tabs inside the text are preserved.
        $columns = explode("\t", rtrim($line, "\r\n"), 3);
        if (count($columns) < 3) {
            // Blank or malformed row: nothing meaningful to write.
            continue;
        }

        $startSeconds = ((float) $columns[0]) / 1000.0;
        $text = trim($columns[2]);

        fwrite($writer, formatTranscriptionTimestamp($startSeconds) . ": $text\n");
    }

    fclose($reader);
    fclose($writer);

    return true;
}

// Script entry point: the first CLI argument is the `.tsv` file to clean.
$input = $argv[1] ?? null;
if ($input === null) {
    echo "Usage: php clean-transcription.php <file.tsv>\n";
} else {
    cleanTranscription($input);
}