transcribe-makemd.php

<?php
/**
 *
 * Create a .md file with proper disclaimer header and links to timestamped portions of the uploaded video
 *
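 * Expects the directory structure provided by the primary transcribe function:
 * the whisper output must be at `<audio_file>-transcription/<audio file name without extension>.tsv`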
 *
 * @arg1 audio_file relative path to audio file that was transcribed
 * @arg2 yt_url the url to a youtube video representing this audio
 * @arg3 offset the offset, in seconds, for the beginning of this audio in the yt video
 *
 * @return void
 * @output `<audio file name without extension>.md`, written to the directory that contains the audio file
 */

// Helpers are guarded so that including this script more than once in the same
// process does not trigger a "cannot redeclare function" fatal error.
if (!function_exists('makemd_format_timestamp')) {
    /**
     * Render a position (in seconds) as the label printed in front of a transcript line.
     *
     * Positions below one minute are printed unchanged (e.g. `5.32s`). Longer positions
     * become `<minutes>m<seconds>s`, rounded to the nearest whole second. Hours are not
     * broken out, so 3725 seconds is rendered as `62m5s`.
     *
     * @param float $seconds position in seconds
     *
     * @return string human-readable label
     */
    function makemd_format_timestamp(float $seconds): string
    {
        if ($seconds < 60) {
            return $seconds.'s';
        }

        // Round the total BEFORE splitting it, so that 119.7 becomes `2m0s`
        // instead of the impossible `1m60s`.
        $total = (int)round($seconds);

        return intdiv($total, 60).'m'.($total % 60).'s';
    }
}

if (!function_exists('makemd_timestamp_url')) {
    /**
     * Append a `t=<seconds>` start-time parameter to a video url.
     *
     * Uses `&` when the url already carries a query string (`watch?v=...`) and `?`
     * when it does not (`https://youtu.be/<id>`).
     *
     * @param string $url     the video url
     * @param int    $seconds start position in whole seconds
     *
     * @return string the url pointing at the given position
     */
    function makemd_timestamp_url(string $url, int $seconds): string
    {
        $separator = strpos($url, '?') === false ? '?' : '&';

        return $url.$separator.'t='.$seconds;
    }
}

// --- Arguments -------------------------------------------------------------

// The audio file is the only required argument; every path below derives from it.
$audio_file = $argv[1] ?? '';
if ($audio_file === '') {
    echo "\nPath to the transcribed audio file MUST be supplied as first argument.\n";
    return;
}

// The url and the offset are optional: without a valid url the transcript is
// still generated, just without timestamp links.
$yt_url = $argv[2] ?? '';
$offset = (int)($argv[3] ?? 0);

$use_links = true;

if (!filter_var($yt_url, FILTER_VALIDATE_URL)){
    echo "\nValid youtube url SHOULD be supplied as second argument. Generating without timestamp links.\n";
    $use_links = false;
}

// --- Paths -----------------------------------------------------------------

$audio_file_without_ext = pathinfo($audio_file, PATHINFO_FILENAME);
$audio_file_basename = pathinfo($audio_file, PATHINFO_BASENAME);
$transcription_dir = $audio_file.'-transcription';
$tsv_file = $transcription_dir.'/'.$audio_file_without_ext.'.tsv';
$out_dir = dirname($audio_file);
$md_file = $out_dir.'/'.$audio_file_without_ext.'.md';

if (!is_file($tsv_file)){
    echo "\n.tsv file does not exist.\nPath: $tsv_file\n";
    return;
}

// Never overwrite an existing transcript; it may have been corrected by hand.
if (file_exists($md_file)){
    echo "\nFile '".basename($md_file)."' exists. Move or delete file and run makemd again.\n";
    return;
}

// --- Open files ------------------------------------------------------------

// Open the input first so an unreadable .tsv does not leave an empty .md behind.
$fh = fopen($tsv_file, 'r');
if ($fh === false) {
    echo "\nCould not open .tsv file for reading.\nPath: $tsv_file\n";
    return;
}

$out = fopen($md_file, 'w');
if ($out === false) {
    fclose($fh);
    echo "\nCould not open '".basename($md_file)."' for writing.\n";
    return;
}

// --- Header ----------------------------------------------------------------

$md_audio_link = $use_links ? "See [original audio]({$yt_url})" : "";

$md_header = <<<MD
# Auto-Generated Transcription of {$audio_file_basename}
This transcription is generated using OpenAI's tool [whisper](https://github.com/openai/whisper), using the `small` model.

**WARNING:** This transcription contains errors and should be used as an assistive guide, not an authoritative reference. 

$md_audio_link

Transcription:

MD;

fwrite($out, $md_header);

// --- Transcript lines ------------------------------------------------------

## Skip the first line (the .tsv column header)
fgets($fh);

// Compare against false explicitly: a bare truthiness test would also stop on
// a final line consisting of just "0".
while (($line = fgets($fh)) !== false) {
    // Columns are: start (ms), end (ms), text. The limit of 3 keeps any tab
    // characters inside the text instead of silently dropping what follows.
    $columns = explode("\t", rtrim($line, "\r\n"), 3);
    if (count($columns) < 3) {
        // Blank or malformed row (e.g. an empty line at the end of the file).
        continue;
    }

    $text = trim($columns[2]);

    // Position inside the video: segment start in seconds plus the audio's offset.
    $start = ((float)$columns[0]) / 1000 + $offset;
    // The link uses whole seconds, truncated so playback never starts late.
    $start_seconds = (int)$start;
    $start_string = makemd_format_timestamp($start);

    if ($use_links){
        $url = makemd_timestamp_url($yt_url, $start_seconds);
        $time_out = "[{$start_string}]({$url}): $text\n";
    } else {
        $time_out = "{$start_string}: $text\n";
    }

    fwrite($out, $time_out);
}

fclose($fh);
fclose($out);