#!/usr/bin/env bash
##
# Convert image & pdf files to text
#
##
# Convert pdf files to plain text
#
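# @arg file path to the pdf file to convert, relative to the current working directory
# @return void
# @output `gen-ocr-img-<name>/` (page images from pdftoppm), `gen-ocr-txt-<name>/` (per-page text from tesseract) and `<name>-ocr.txt` (combined text), all in the current working directory; <name> is the file path without its extension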
#
function ocr(){
  # OCR the PDF given as $1 (path relative to the current working directory):
  # render every page to PNG with pdftoppm, run tesseract on each page image,
  # and concatenate the per-page text into one file.
  #
  # Created in the current working directory (<name> is $1 minus its extension):
  #   gen-ocr-img-<name>/   page images
  #   gen-ocr-txt-<name>/   per-page text files
  #   <name>-ocr.txt        disclaimer header followed by the text of each page
  #
  # Status: 0 on success; 1 when the argument is missing or not a file, when
  # pdftoppm fails or yields no pages, or when tesseract fails on any page.

  # Everything is local so a call does not overwrite same-named variables
  # (e.g. "file") in the shell that sourced this library.
  local file="${1:-}"
  if [[ -z "$file" ]]; then
    echo "ocr: missing path to pdf file" >&2
    return 1
  fi
  if [[ ! -f "$file" ]]; then
    echo "ocr: no such file: ${file}" >&2
    return 1
  fi

  local filename_without_extension="${file%.*}"
  local final_txt_file="${filename_without_extension}-ocr.txt"
  local image_dir_rel="gen-ocr-img-${filename_without_extension}/"
  local image_dir="${PWD}/${image_dir_rel}"
  local text_dir_rel="gen-ocr-txt-${filename_without_extension}/"
  local text_dir="${PWD}/${text_dir_rel}"

  # -p: running the function again for the same PDF must not fail on the
  # directories left behind by the previous run.
  mkdir -p "$image_dir" "$text_dir" || return 1

  echo "pdftoppm start"
  # The output prefix ends in "/", so pdftoppm writes "-<page>.png" files
  # inside image_dir.
  if ! pdftoppm -png "$file" "$image_dir"; then
    echo "ocr: pdftoppm failed for ${file}" >&2
    return 1
  fi
  echo "pdftoppm done"

  # Truncate (">") instead of appending so a re-run regenerates the file
  # rather than stacking a second copy of the header and all pages.
  {
    echo "****"
    echo "This file is generated via Tesseract OCR and may contain errors when compared with the original PDF."
    echo "See Tesseract OCR at https://github.com/tesseract-ocr/tesseract"
    echo "****"
  } > "$final_txt_file"

  local pagenum=0
  local rc=0
  local page_image base out_file
  for page_image in "${image_dir}"*.png; do
    # Without nullglob an unmatched pattern is passed through literally.
    [[ -e "$page_image" ]] || continue
    pagenum=$((pagenum + 1))
    echo "Tesseract ${page_image}"
    base="${page_image##*/}"
    # tesseract appends ".txt" to this output base on its own.
    out_file="${text_dir_rel}${base}"
    {
      echo ""
      echo "########"
      echo "# Page ${pagenum}"
      echo "--------"
    } >> "$final_txt_file"
    if tesseract "$page_image" "$out_file"; then
      cat "${out_file}.txt" >> "$final_txt_file"
    else
      # Keep going so the remaining pages are still converted.
      echo "ocr: tesseract failed for ${page_image}" >&2
      rc=1
    fi
  done

  if (( pagenum == 0 )); then
    echo "ocr: pdftoppm produced no page images for ${file}" >&2
    return 1
  fi
  return "$rc"
}
##
# Clean a .tsv file into a human-readable form
#
# @arg file relative path to tsv file, within current working directory
# @return void
# @output a .md file with timestamped transcript in human-readable format
#
function transcribe_cleantsv(){
  # Resolve the tsv path given as $1 against the current working directory and
  # pass it to lib/clean-transcription.php under $codeDir (a global that this
  # function reads but never sets).
  #
  # Status: 1 when $1 or $codeDir is missing, otherwise php's exit status.

  # Locals keep "file"/"src" from overwriting the caller's variables.
  local file="${1:-}"
  local src

  if [[ -z "$file" ]]; then
    echo "transcribe_cleantsv: missing path to tsv file" >&2
    return 1
  fi
  # An empty codeDir would silently resolve to "/lib/clean-transcription.php".
  if [[ -z "${codeDir:-}" ]]; then
    echo "transcribe_cleantsv: codeDir is not set" >&2
    return 1
  fi

  # Only relative paths get the working directory prepended; an absolute path
  # is already complete.
  if [[ "$file" == /* ]]; then
    src="$file"
  else
    src="${PWD}/${file}"
  fi

  php "${codeDir}/lib/clean-transcription.php" "$src"
}
##
# Create a .md file with proper disclaimer header and links to timestamped portions of the uploaded video
#
# Expects directory structure provided by primary transcribe function
#
# @arg1 file relative path to audio file that was transcribed
# @arg2 yt_url the url to a youtube video representing this audio
# @arg3 offset the offset, in seconds, for the beginning of this audio in the yt video
#
# @return void
# @output `${file}.md` in the current working directory, where `${file}` is the audio file's name without its extension
#
function transcribe_makemd(){
  # Hand an already-transcribed audio file to lib/transcribe-makemd.php under
  # $codeDir (a global that this function reads but never sets).
  #
  #   $1  path to the audio file, relative to the current working directory
  #   $2  url of the youtube video that contains this audio
  #   $3  offset in seconds of the audio within that video (default: 0)
  #
  # Status: 1 when $1 or $codeDir is missing, otherwise php's exit status.
  #
  # NOTE(review): the PHP script presumably locates
  # "<audio_file>-transcription/<name>.tsv" on its own - confirm against
  # lib/transcribe-makemd.php.

  # Locals keep these names from overwriting the caller's variables.
  local audio_filename="${1:-}"
  local yt_url="${2:-}"
  local offset="${3:-0}"   # unset or empty offset means "starts at 0 seconds"
  local audio_file

  if [[ -z "$audio_filename" ]]; then
    echo "transcribe_makemd: missing path to audio file" >&2
    return 1
  fi
  # An empty codeDir would silently resolve to "/lib/transcribe-makemd.php".
  if [[ -z "${codeDir:-}" ]]; then
    echo "transcribe_makemd: codeDir is not set" >&2
    return 1
  fi

  # Only relative paths get the working directory prepended; an absolute path
  # is already complete.
  if [[ "$audio_filename" == /* ]]; then
    audio_file="$audio_filename"
  else
    audio_file="${PWD}/${audio_filename}"
  fi

  php "${codeDir}/lib/transcribe-makemd.php" "$audio_file" "$yt_url" "$offset"
}