ocr.bash

#!/usr/bin/env bash
##
# Convert image & pdf files to text
#


##
# Convert pdf files to plain text
#
# Renders every page of the pdf to a png with pdftoppm, runs tesseract on each
# page image, and appends the per-page text (under a "# Page N" heading) to one
# combined text file. The combined file is appended to, never truncated.
#
# @arg file relative path to pdf file to convert, relative to current working directory
# @return 1 if no file is given, the output directories cannot be created, or
#         pdftoppm fails; otherwise the status of the last command run
# @output `gen-ocr-img-<name>/` (page images), `gen-ocr-txt-<name>/` (per-page text)
#         and `<name>-ocr.txt` (combined text), where <name> is the given path
#         without its extension
#
function ocr(){
    local file="$1"
    if [[ -z "$file" ]]; then
        echo "ocr: missing path to the pdf file to convert" >&2
        return 1
    fi

    local filename_without_extension="${file%.*}"
    local final_txt_file="${filename_without_extension}-ocr.txt"
    local image_dir_rel="gen-ocr-img-${filename_without_extension}/"
    local text_dir_rel="gen-ocr-txt-${filename_without_extension}/"
    # Declared separately from the assignment so a failing $(pwd) is not masked by `local`.
    local image_dir text_dir
    image_dir="$(pwd)/${image_dir_rel}"
    text_dir="$(pwd)/${text_dir_rel}"

    # -p: a second run on the same file must not fail because the directories already exist.
    mkdir -p "$image_dir" "$text_dir" || return 1

    echo "pdftoppm start"
    # The output prefix ends in "/", so the page images are written inside image_dir.
    if ! pdftoppm -png "$file" "$image_dir"; then
        echo "ocr: pdftoppm failed for ${file}" >&2
        return 1
    fi
    echo "pdftoppm done"

    {
        echo "****"
        echo "This file is generated via Tesseract OCR and may contain errors when compared with the original PDF."
        echo "See Tesseract OCR at https://github.com/tesseract-ocr/tesseract"
        echo "****"
    } >> "$final_txt_file"

    local pagenum=0
    local page_image base out_file
    for page_image in "${image_dir}"*.png; do
        # An unmatched glob stays literal; skip it instead of running OCR on a non-existent file.
        [[ -e "$page_image" ]] || continue

        pagenum=$((pagenum + 1))
        # printf, because bash's echo prints "\n" literally instead of a newline.
        printf 'Tesseract %s\n\n' "$page_image"
        base="$(basename "$page_image")"
        # tesseract appends ".txt" to the output base it is given.
        out_file="${text_dir_rel}${base}"
        tesseract "$page_image" "$out_file"

        {
            echo ""
            echo "########"
            echo "# Page ${pagenum}"
            echo "--------"
            cat "${out_file}.txt"
        } >> "$final_txt_file"
    done

}

##
# Clean a .tsv file into a human-readable form
#
# Resolves the given path against the current working directory and passes the
# resulting absolute path to `lib/clean-transcription.php` under `$codeDir`.
#
# @global codeDir directory that contains `lib/clean-transcription.php` (read only)
# @arg file relative path to tsv file, within current working directory
# @return exit status of the php command
# @output a .md file with timestamped transcript in human-readable format
#         (written by the php script, which is not visible here -- confirm there)
#
function transcribe_cleantsv(){
    # local: "file" and "src" are generic names; do not overwrite the caller's variables.
    local file="$1"
    local src
    src="$(pwd)/$file"
    php "$codeDir/lib/clean-transcription.php" "$src"
}

##
# Create a .md file with proper disclaimer header and links to timestamped portions of the uploaded video
#
# Expects directory structure provided by primary transcribe function
# (presumably `<audio file>-transcription/<name>.tsv`, going by the commented-out
# code this function used to carry -- confirm against the php script).
#
# Resolves the audio path against the current working directory and delegates
# to `lib/transcribe-makemd.php` under `$codeDir`.
#
# @global codeDir directory that contains `lib/transcribe-makemd.php` (read only)
# @arg1 file relative path to audio file that was transcribed
# @arg2 yt_url the url to a youtube video representing this audio
# @arg3 offset the offset, in seconds, for the beginning of this audio in the yt video;
#       defaults to 0 when omitted or empty
#
# @return exit status of the php command
# @output `${file}.md` in the current working directory, without the audio file's extension
#         (written by the php script, which is not visible here -- confirm there)
#
function transcribe_makemd(){
    # local: keep these names from overwriting same-named variables in the caller.
    local audio_file yt_url offset
    audio_file="$(pwd)/$1"
    yt_url="$2"
    # An omitted or empty offset means the audio starts at the beginning of the video.
    offset="${3:-0}"

    php "$codeDir/lib/transcribe-makemd.php" "$audio_file" "$yt_url" "$offset"

}