<?php
/**
*
* Create a .md file with proper disclaimer header and links to timestamped portions of the uploaded video
*
* Expects directory structure provided by primary transcribe function
*
* @arg1 audio_file relative path to audio file that was transcribed
* @arg2 yt_url the url to a youtube video representing this audio
* @arg3 offset the offset, in seconds, for the beginning of this audio in the yt video
*
* @return void
* @output `<audio_file without its extension>.md`, written to the same directory as the audio file
*/
// ---------------------------------------------------------------------------
// makemd: turn the whisper .tsv transcription of an audio file into a Markdown
// document, optionally linking every segment to its timestamp in a video.
//
// Arguments (read from $argv):
//   1. audio_file  path to the audio file that was transcribed (required)
//   2. yt_url      video URL used for the timestamp links (optional)
//   3. offset      seconds at which this audio starts inside the video (optional)
//
// Input:  "<audio_file>-transcription/<audio name without extension>.tsv"
// Output: "<directory of audio_file>/<audio name without extension>.md"
//
// Every failure path prints a message and `return`s, which ends this script
// file without writing anything.
// ---------------------------------------------------------------------------

// Read arguments defensively so a missing argument does not raise an
// "undefined array key" warning.
$audio_file = $argv[1] ?? null;
$yt_url = $argv[2] ?? '';
$offset = (int)($argv[3] ?? 0);

if ($audio_file === null || $audio_file === '') {
    echo "\nPath to the transcribed audio file MUST be supplied as first argument.\n";
    return;
}

// Without a valid URL the document is still generated, just without links.
$use_links = true;
if (!filter_var($yt_url, FILTER_VALIDATE_URL)) {
    echo "\nValid youtube url SHOULD be supplied as second argument. Generating without timestamp links.\n";
    $use_links = false;
}

$audio_file_without_ext = pathinfo($audio_file, PATHINFO_FILENAME);
$audio_file_basename = pathinfo($audio_file, PATHINFO_BASENAME);
$transcription_dir = $audio_file.'-transcription';
$tsv_file = $transcription_dir.'/'.$audio_file_without_ext.'.tsv';
$out_dir = dirname($audio_file);
$md_file = $out_dir.'/'.$audio_file_without_ext.'.md';

if (!is_file($tsv_file)) {
    echo "\n.tsv file does not exist.\nPath: $tsv_file\n";
    return;
}
// Never overwrite an existing transcription document.
if (file_exists($md_file)) {
    echo "\nFile '".basename($md_file)."' exists. Move or delete file and run makemd again.\n";
    return;
}

// Open the input first so an unreadable .tsv does not leave an empty .md behind.
$fh = fopen($tsv_file, 'r');
if ($fh === false) {
    echo "\nCould not open .tsv file for reading.\nPath: $tsv_file\n";
    return;
}
$out = fopen($md_file, 'w');
if ($out === false) {
    fclose($fh);
    echo "\nCould not open .md file for writing.\nPath: $md_file\n";
    return;
}

$md_audio_link = $use_links ? "See [original audio]({$yt_url})" : "";

// Blank lines separate the Markdown paragraphs. The two empty lines before the
// closing marker make the header end with a blank line, so the first transcript
// entry starts on its own line instead of being glued to "Transcription:".
$md_header = <<<MD
# Auto-Generated Transcription of {$audio_file_basename}

This transcription is generated using OpenAI's tool [whisper](https://github.com/openai/whisper), using the `small` model.

**WARNING:** This transcription contains errors and should be used as an assistive guide, not an authoritative reference.

{$md_audio_link}

Transcription:


MD;
fwrite($out, $md_header);

// The time parameter must be introduced with "?" when the URL has no query
// string yet (e.g. https://youtu.be/ID) and with "&" otherwise.
$link_prefix = $yt_url.(strpos($yt_url, '?') === false ? '?' : '&').'t=';

// Skip the first line (column headers).
fgets($fh);

// Compare against false explicitly: a line consisting of "0" is falsy and
// would otherwise end the loop early.
while (($line = fgets($fh)) !== false) {
    // Columns: start (ms), end (ms), text. The limit keeps tabs inside the text.
    $parts = explode("\t", $line, 3);
    if (count($parts) < 3) {
        // Blank or malformed row: nothing to emit.
        continue;
    }
    $text = trim($parts[2]);

    // Segment start in seconds, shifted to its position inside the video.
    $start = ((float)$parts[0]) / 1000 + $offset;
    // Whole seconds (truncated) for the link target.
    $start_seconds = (int)$start;

    if ($start >= 60) {
        // Round once, then split with integer arithmetic so the seconds part
        // can never come out as "60" (e.g. 119.7s is 2m0s, not 1m60s).
        $rounded = (int)round($start);
        $start_string = intdiv($rounded, 60).'m'.($rounded % 60).'s';
    } else {
        $start_string = $start.'s';
    }

    if ($use_links) {
        $url = $link_prefix.$start_seconds;
        $time_out = "[{$start_string}]({$url}): $text\n";
    } else {
        $time_out = "{$start_string}: $text\n";
    }
    fwrite($out, $time_out);
}

fclose($fh);
fclose($out);