FVTT-pdf2json-rolltable-cre.../step-0-Cleaner.sh
Techognito d03ef3ea7c Step 0 fixes
- Issue #1 created because of the workaround for Bookofrandomtables1.pdf
2025-08-15 22:42:28 +02:00

161 lines
3.7 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
### Variables used in script
# Input text file processed by this step (hard-coded for now).
infile="bookofrandomtables1.txt"
# Disabled alternative: take the input file from the first argument.
# NOTE(review): $1 is also used as the mode selector by the case statement at
# the bottom of the script, so enabling this as-is would clash with "testing".
##infile=$1
# Working copy of the input; fixScan creates it and the later steps read it.
cleanFile="clean-$infile"
# List of table titles, one per line, written by getTitles.
titleFile="titles-$infile"
# Directory that receives the per-table output files.
outpath='./Step-1'
###
### Debugger
# Loads an external helper from the user's home directory. The functions below
# call `debugger`, which is presumably defined by that file (not visible here).
source ~/bin/debugger
# NOTE(review): presumably read by the sourced helper to switch its output on;
# confirm against ~/bin/debugger.
debug=true
###
#######################################
# Extract the table titles listed in the book's table of contents.
#
# Keeps the lines between "Table of Contents" and the last
# "How to Use this Book", selects the entries that are followed by dot
# leaders, drops the "Credits" and "How to Use this Book" entries, and
# writes the text in front of the dots to the title file.
#
# Globals:
#   cleanFile (read)  - cleaned text file to scan
#   titleFile (write) - overwritten with one title per line
# Returns:
#   Status of the final grep (non-zero when no title was extracted), or 1
#   when the clean file cannot be read.
#######################################
getTitles() {
  local total_lines

  # The file's own line count is used as grep context so that -A / -B mean
  # "everything after" / "everything before" the matching line.
  total_lines=$(wc -l < "$cleanFile") || return 1
  total_lines=$(( total_lines ))  # strip padding some wc builds emit

  # NOTE(review): inside a bracket expression a backslash is an ordinary
  # character, so "\-\" below parses as the one-character range
  # backslash..backslash and a plain hyphen is NOT part of the set. Confirm
  # whether hyphenated titles are meant to match; the patterns are left
  # exactly as they were.
  # The trailing [^.] in the last pattern means the final character of each
  # extracted title may be any character other than a dot.
  grep -A "$total_lines" "Table of Contents" -- "$cleanFile" \
    | grep -B "$total_lines" "How to Use this Book" \
    | grep -E "[\ \#\-\,\&\'\`\´\\A-Za-z0-9_]{1,999}\.{1,}" \
    | grep -v "Credits" \
    | grep -v "How to Use this Book" \
    | grep -Eo "^[\ \#\-\,\&\'\`\´\\A-Za-z0-9_]{1,999}[^.]" > "$titleFile"
}
#######################################
# Create the working copy of the input and patch known text glitches in it.
#
# Globals:
#   infile    (read)  - original text file; never modified
#   cleanFile (write) - copy of infile that is edited in place
# Returns:
#   Non-zero if the copy or either substitution fails.
#######################################
fixScan(){
  # Stop here if the copy fails; otherwise sed would run on a missing or
  # stale clean file.
  cp -- "$infile" "$cleanFile" || return 1
  # NOTE(review): as the expression reads here, pattern and replacement are
  # the same three letters, which makes it a no-op. Presumably the pattern was
  # the single-glyph "ffi" ligature produced by the PDF extraction; confirm the
  # literal survived copy/paste. It is left exactly as found.
  sed -i 's/ffi/ffi/gm' "$cleanFile" || return 1
  # Grammar slip in the source text.
  sed -i 's/Items in a Office/Items in an Office/gm' "$cleanFile"
}
#######################################
# Split the cleaned text into one file per table listed in the title file.
#
# For every title, the matching heading line is looked up with getTitleLine.
# The text after the first "Good adventuring" line is then cut from that
# heading up to (and including) the next table's heading and written to
# "$outpath/<table number>-<heading>.txt". The last table runs to the end of
# the file.
#
# Globals:
#   titleFile (read) - one title per line
#   cleanFile (read) - cleaned text file
#   outpath   (read) - existing output directory
# Returns:
#   1 when there are no titles, a line count cannot be read, or a title
#   cannot be matched to a heading; otherwise the status of the last
#   pipeline.
#######################################
GetListFromTitle() {
  local titlecount maxlines
  local currentlinenum=1
  local lineTitle currentline nextLineTitle nextline tableNum

  titlecount=$(wc -l -- "$titleFile" | awk '{print $1}')
  debugger "TitleCount: $titlecount"
  if (( titlecount < 1 )); then
    echo "No titles found in $titleFile" >&2
    return 1
  fi

  # Computed once, before the loop: it is also needed for the last table,
  # including when the loop body never runs (a single title).
  maxlines=$(wc -l -- "$cleanFile" | awk '{print $1}')
  if [[ -z "$maxlines" ]]; then
    echo "Unable to count the lines of $cleanFile" >&2
    return 1
  fi

  while (( currentlinenum < titlecount )); do
    debugger ""
    debugger ""
    debugger ""
    debugger "$currentlinenum / $titlecount"
    lineTitle=$(sed -n "$currentlinenum p" "$titleFile")
    debugger "Title from Title File= $lineTitle"
    # getTitleLine runs in a command substitution, so its `exit 1` only ends
    # that subshell; the status has to be checked here to stop the run.
    currentline=$(getTitleLine "$lineTitle") || return 1
    debugger "Current Line $currentline"
    debugger "CurrentLineNum: $currentlinenum"
    tableNum=$currentlinenum
    currentlinenum=$(( currentlinenum + 1 ))
    debugger "CurrentLineNum After+1: $currentlinenum"
    nextLineTitle=$(sed -n "$currentlinenum p" "$titleFile")
    debugger "Next Lines Title= $nextLineTitle"
    nextline=$(getTitleLine "$nextLineTitle") || return 1
    debugger "Next Line $nextline"
    debugger "$currentline"
    debugger "$nextline"
    # Skip the front matter first so the table-of-contents entries cannot
    # match, then cut from this heading to the next one. Headings are matched
    # as fixed strings (-F) because they are document text, not regexes.
    grep -A "$maxlines" "Good adventuring" -- "$cleanFile" \
      | grep -F -A "$maxlines" -- "$currentline" \
      | grep -F -B "$maxlines" -- "$nextline" > "$outpath/$tableNum-$currentline.txt"
  done

  # Last table: resolved and cut exactly like the others. Matching the raw
  # table-of-contents title against the whole file would start the cut at the
  # contents entry and capture nearly the entire document.
  lineTitle=$(sed -n "$currentlinenum p" "$titleFile")
  currentline=$(getTitleLine "$lineTitle") || return 1
  debugger HEAERGAERGA
  debugger "$currentline"
  debugger EFAEFAWFE
  grep -A "$maxlines" "Good adventuring" -- "$cleanFile" \
    | grep -F -A "$maxlines" -- "$currentline" > "$outpath/$titlecount-$currentline.txt"
}
#######################################
# Find the heading line in the clean file that belongs to a table title.
#
# First looks for lines that contain the title verbatim and have no '.'
# (table-of-contents entries carry dot leaders). If that finds nothing, falls
# back to lines without '.' that contain every whitespace-separated word of
# the title, in any order.
#
# Globals:
#   cleanFile (read)
# Arguments:
#   $1 - title as listed in the table of contents
# Outputs:
#   stdout - the matching text with runs of whitespace collapsed to single
#            spaces; several matching lines end up joined on one line
#   stderr - progress messages and the failure message
# Returns:
#   Calls `exit 1` when nothing matches. Callers use command substitution, so
#   this ends only that subshell and they must check the status themselves.
#######################################
function getTitleLine() {
  local IFS=$' \t\n'
  local output_line word
  local -a words=()

  output_line=$(grep -F -- "$1" "$cleanFile" | grep -v '\.')
  # Diagnostics go to stderr: stdout is captured by the caller, and writing
  # to "$(tty)" fails with "ambiguous redirect" when no terminal is attached.
  echo "output before badgrepper: $output_line" >&2

  if [ -z "$output_line" ] ; then
    # Word-by-word fallback. The words are applied as successive fixed-string
    # filters instead of eval'ing a generated pipeline, so quotes, backticks
    # or '$' in a title cannot break the command or get executed.
    read -r -d '' -a words <<< "$1"
    if (( ${#words[@]} > 0 )); then
      output_line=$(grep -vE '\.' -- "$cleanFile")
      for word in "${words[@]}"; do
        output_line=$(grep -F -- "$word" <<< "$output_line")
      done
    fi
  fi
  echo "output after badgrepper: $output_line" >&2

  if [ -z "$output_line" ] ; then
    echo "Unable to match Table of Contents Title with title further down in document" >&2
    exit 1
  fi

  # Collapse whitespace the way an unquoted echo would, but without letting
  # characters such as '*' expand against the current directory.
  read -r -d '' -a words <<< "$output_line"
  printf '%s\n' "${words[*]}"
}
#######################################
# Append the whitespace-separated words of a string to the `grepme` array.
#
# Globals:
#   grepme (appended) - array provided by the caller
# Arguments:
#   $1 - string to split into words
#######################################
function dumpOfAllMyRandomCommandsAndStuff() {
  local -a split_words=()
  # read -a splits on IFS exactly like an unquoted expansion would, but does
  # not glob-expand words such as '*' against the current directory.
  # -d '' makes it consume the whole string, including embedded newlines.
  read -r -d '' -a split_words <<< "$1"
  grepme+=("${split_words[@]}")
}
#######################################
# Build a grep pipeline that keeps lines containing every word of a string.
#
# Arguments:
#   $1 - string whose whitespace-separated words become one `grep "<word>"`
#        stage each
# Outputs:
#   stdout - the stages joined by " | ", for example
#            grep "Items" | grep "in" | grep "Office"
#            (an empty line when there are no words)
#######################################
function badGrepper() {
  local -a grepme=()
  local pipeline="" word escaped special
  local bs='\'
  local idx

  # The helper appends to `grepme`; bash's dynamic scoping makes it write to
  # the local array declared above.
  dumpOfAllMyRandomCommandsAndStuff "$1"

  for idx in "${!grepme[@]}"; do
    word=${grepme[idx]}
    # The result is meant to be eval'ed, so escape what stays special inside
    # double quotes. Backslash goes first so the escapes added for the other
    # characters are not doubled.
    escaped=$word
    for special in "$bs" '"' '$' '`'; do
      escaped=${escaped//"$special"/"$bs$special"}
    done
    # Separate stages by position. Comparing each word with the last word
    # dropped the separator whenever an earlier word repeated the last one.
    if (( idx > 0 )); then
      pipeline+=' | '
    fi
    pipeline+="grep \"$escaped\""
  done

  printf '%s\n' "$pipeline"
}
# Entry point.
#   testing   - print the grep pipeline badGrepper builds for an empty title
#   otherwise - run the whole step: clean the input, extract the titles,
#               create the output directory and write one file per table
# Each stage depends on the files the previous one produced, so the run stops
# at the first stage that fails instead of continuing on missing input.
case "${1:-}" in
  testing)
    echo HELLO
    grepresult=$(badGrepper)
    echo "The following is the command '$grepresult'"
    ;;
  *)
    debugger "Fixing spelling from certain pdftotext bugs"
    if ! fixScan; then
      echo "Unable to create $cleanFile from $infile" >&2
      exit 1
    fi
    debugger "Getting Titles"
    if ! getTitles; then
      echo "No table titles could be extracted into $titleFile" >&2
      exit 1
    fi
    if ! mkdir -p -- "$outpath"; then
      echo "Unable to create output directory $outpath" >&2
      exit 1
    fi
    debugger "Creating lists"
    GetListFromTitle || exit 1
    ;;
esac