FVTT-pdf2json-rolltable-cre.../step-0-Cleaner.sh
Techognito d47c8f88e1 Updated step-0 again and added debugger script
Step 0 progress:
- The grep workaround now works, but causes problems if there are multiple titles that are similar (e.g., "Items in a Office" and "Items in a Port Master's Office"), as the grepper will return both when searching for the first
- Only 1 known bug remains (the one listed above)
2025-08-15 21:21:20 +02:00

115 lines
2.8 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Step 0: clean the pdftotext output of a book and split it into one text file
# per table.
#
# Optional environment overrides (defaults reproduce the previous hard-coded
# values):
#   STEP0_INFILE  - text file to process
#   STEP0_OUTPATH - directory that receives the per-table files

# Text file to read.
infile="${STEP0_INFILE:-bookofrandomtables1.txt}"
# Working files live in the current directory and are named after the input's
# basename, so an input given with a directory part still yields valid names.
cleanFile="clean-${infile##*/}"
titleFile="titles-${infile##*/}"
# Directory for the per-table output files.
outpath="${STEP0_OUTPATH:-./Step-1}"
#######################################
# Extract the table titles from the table-of-contents part of the cleaned text
# and write them, one per line, to the title file.
#
# The region scanned starts at the first "Table of Contents" line and ends at
# the last "How to Use this Book" line. Inside it, every line that has title
# characters followed by at least one "." is treated as a contents entry; the
# "Credits" and "How to Use this Book" entries are dropped and the text in
# front of the dots is kept as the title.
#
# Globals:
#   cleanFile (read)    - cleaned text
#   titleFile (written) - receives the titles
# Returns:
#   1 when cleanFile cannot be read; otherwise the status of the last grep
#   (non-zero when no title was found).
#######################################
getTitles() {
  if [[ ! -r "$cleanFile" ]]; then
    printf 'getTitles: cannot read %s\n' "$cleanFile" >&2
    return 1
  fi

  # A context length at least as large as the file turns -A/-B into
  # "everything after/before the match". +1 covers a missing final newline.
  local total_lines
  total_lines=$(( $(wc -l < "$cleanFile") + 1 ))

  # Characters a title may consist of: backslash, letters, digits, underscore,
  # space, # , & ' ` ´ and the hyphen. The hyphen is listed last on purpose:
  # only there is it a literal member; between two other members it is read as
  # a range operator, which silently dropped it from the set and truncated
  # hyphenated titles.
  local -r title_chars="[\\A-Za-z0-9_ #,&'\`´-]"

  grep -A "$total_lines" -e "Table of Contents" < "$cleanFile" \
    | grep -B "$total_lines" -e "How to Use this Book" \
    | grep -E -e "${title_chars}{1,999}\.{1,}" \
    | grep -v -e "Credits" \
    | grep -v -e "How to Use this Book" \
    | grep -E -o -e "^${title_chars}{1,999}[^.]" > "$titleFile"
}
#######################################
# Copy the input text to the clean file, expanding the single-character "ffi"
# ligature (U+FB03) into the three plain letters f, f, i.
#
# The pattern has to be the ligature: substituting the plain letters "ffi"
# with "ffi" would leave the text unchanged. It is written as UTF-8 byte
# escapes so that it cannot be normalised away by an editor or web view.
#
# Globals:
#   infile (read)       - text to clean
#   cleanFile (written) - cleaned copy
# Returns:
#   1 when infile cannot be read; otherwise the exit status of sed.
#######################################
fixScan() {
  # UTF-8 encoding of U+FB03 LATIN SMALL LIGATURE FFI.
  local -r ligature=$'\xef\xac\x83'

  if [[ ! -r "$infile" ]]; then
    printf 'fixScan: cannot read %s\n' "$infile" >&2
    return 1
  fi

  # LC_ALL=C makes sed compare raw bytes, so the result does not depend on the
  # caller's locale or on stray invalid sequences in the input.
  LC_ALL=C sed -e "s/${ligature}/ffi/g" < "$infile" > "$cleanFile"
}
#######################################
# Print the line(s) of the cleaned text that form the heading of a title.
#
# Candidates are the lines that contain no "." (contents entries carry dot
# leaders) and that contain every whitespace-separated word of the title as a
# fixed string. If some candidates equal the title once runs of whitespace are
# collapsed, only those are printed; this keeps "Items in a Office" from also
# selecting "Items in a Port Master's Office". Otherwise all candidates are
# printed.
#
# Globals:   cleanFile (read)
# Arguments: $1 - title text
# Outputs:   matching lines on stdout; nothing when there is no match
#######################################
_findHeadingLines() {
  local IFS=$' \t\n'
  local title=$1
  local -a words=() fields=()
  local word line candidates wanted exact=""

  read -r -a words <<<"$title"
  if (( ${#words[@]} == 0 )); then
    return 0
  fi

  candidates=$(grep -v -F -e '.' < "$cleanFile")
  for word in "${words[@]}"; do
    if [[ -z "$candidates" ]]; then
      return 0
    fi
    candidates=$(grep -F -e "$word" <<<"$candidates")
  done
  if [[ -z "$candidates" ]]; then
    return 0
  fi

  wanted="${words[*]}"
  while IFS= read -r line; do
    read -r -a fields <<<"$line"
    if [[ "${fields[*]}" == "$wanted" ]]; then
      exact+="${line}"$'\n'
    fi
  done <<<"$candidates"

  if [[ -n "$exact" ]]; then
    printf '%s' "$exact"
  else
    printf '%s\n' "$candidates"
  fi
}

#######################################
# Write one table to "$outpath/<number>-<heading>.txt".
#
# The table is the part of the cleaned text that follows the
# "Good adventuring" line, starting at the heading line and running up to and
# including the next heading line, or to the end of the text when no next
# heading is given. Headings are matched as whole, fixed-string lines.
#
# Globals:   cleanFile (read), outpath (read)
# Arguments: $1 - table number
#            $2 - context length covering the whole text
#            $3 - heading line(s) of this table
#            $4 - heading line(s) of the next table, may be empty
# Returns:   1 when $3 is empty; otherwise the status of the last grep.
#######################################
_writeTableFile() {
  local tableNum=$1 maxlines=$2 currentline=$3 nextline=$4
  local target

  if [[ -z "$currentline" ]]; then
    echo "GetListFromTitle: no heading found for table $tableNum, skipping" >&2
    return 1
  fi

  # Only the first heading line goes into the file name, so a heading that
  # occurs more than once cannot put a newline into the name.
  target="$outpath/$tableNum-${currentline%%$'\n'*}.txt"

  if [[ -n "$nextline" ]]; then
    grep -A "$maxlines" -e "Good adventuring" < "$cleanFile" \
      | grep -A "$maxlines" -F -x -e "$currentline" \
      | grep -B "$maxlines" -F -x -e "$nextline" > "$target"
  else
    grep -A "$maxlines" -e "Good adventuring" < "$cleanFile" \
      | grep -A "$maxlines" -F -x -e "$currentline" > "$target"
  fi
}

#######################################
# Split the cleaned text into one file per title listed in the title file.
#
# For every title the heading line is looked up in the cleaned text; the text
# from that heading up to the heading of the following title is written to
# "$outpath/<n>-<heading>.txt". The last title runs to the end of the text.
# Progress is echoed to stdout.
#
# Globals:
#   titleFile (read) - one title per line
#   cleanFile (read) - cleaned text
#   outpath (read)   - output directory, created when missing
# Returns:
#   1 when an input file is unreadable, the title file is empty, the output
#   directory cannot be created, or at least one table could not be written;
#   0 otherwise.
#######################################
GetListFromTitle() {
  local -a titles=()
  local titlecount maxlines tableNum status=0
  local currentlinenum=1
  local lineTitle nextLineTitle currentline nextline

  if [[ ! -r "$titleFile" || ! -r "$cleanFile" ]]; then
    echo "GetListFromTitle: cannot read $titleFile or $cleanFile" >&2
    return 1
  fi

  mapfile -t titles < "$titleFile"
  titlecount=${#titles[@]}
  echo "TitleCount: $titlecount"
  if (( titlecount == 0 )); then
    echo "GetListFromTitle: $titleFile holds no titles" >&2
    return 1
  fi

  mkdir -p -- "$outpath" || return 1

  # Context length that always reaches the end (or start) of the text. It is
  # computed once, before the loop, so the last table has it even when the
  # loop body never runs. +1 covers a missing final newline.
  maxlines=$(( $(wc -l < "$cleanFile") + 1 ))

  while (( currentlinenum < titlecount )); do
    echo ""
    echo ""
    echo ""
    echo "$currentlinenum / $titlecount"
    lineTitle=${titles[currentlinenum - 1]}
    echo "Title from Title File= $lineTitle"
    currentline=$(_findHeadingLines "$lineTitle")
    echo "Current Line = $currentline"
    echo "CurrentLineNum: $currentlinenum"
    tableNum=$currentlinenum
    currentlinenum=$(( currentlinenum + 1 ))
    echo "CurrentLineNum After+1: $currentlinenum"
    nextLineTitle=${titles[currentlinenum - 1]}
    echo "Next Lines Title= $nextLineTitle"
    nextline=$(_findHeadingLines "$nextLineTitle")
    echo "Next Line $nextline"
    echo "$currentline"
    echo "$nextline"
    _writeTableFile "$tableNum" "$maxlines" "$currentline" "$nextline" \
      || status=1
  done

  # Last title: same heading lookup as the others. Searching the whole text
  # for the bare title would start the table at its contents entry.
  lineTitle=${titles[titlecount - 1]}
  currentline=$(_findHeadingLines "$lineTitle")
  echo HEAERGAERGA
  echo "$lineTitle"
  echo EFAEFAWFE
  _writeTableFile "$titlecount" "$maxlines" "$currentline" "" || status=1

  return "$status"
}
# Append every whitespace-separated word of $1 to the global array "grepme".
# Nothing is appended when $1 holds no words.
dumpOfAllMyRandomCommandsAndStuff() {
  # The expansion is left unquoted on purpose: the shell's word splitting is
  # what turns the string into separate words.
  # shellcheck disable=SC2086
  set -- $1
  if (( $# > 0 )); then
    grepme+=("$@")
  fi
}
#######################################
# Build the text of a shell pipeline that keeps only the lines containing
# every whitespace-separated word of a title.
#
# Arguments:
#   $1 - title text (optional; no words gives an empty result)
# Outputs:
#   One line on stdout of the form: grep "w1" | grep "w2" | ...
#   The result is meant to be appended to a pipeline and run with eval, so the
#   characters that are special inside double quotes (\ " $ `) are
#   backslash-escaped in each word.
#######################################
function badGrepper() {
  local -a words=()
  local word pipeline="" position

  # Split on blanks, tabs and newlines without pathname expansion. read
  # reports end-of-input as a failure here, which is expected.
  IFS=$' \t\n' read -r -d '' -a words <<<"${1-}" || true

  for position in "${!words[@]}"; do
    word=${words[position]}
    # Backslash first, so the escapes added below are not escaped again.
    word=${word//\\/\\\\}
    word=${word//\"/\\\"}
    word=${word//\$/\\\$}
    word=${word//\`/\\\`}

    # Separate by position, not by comparing the word with the last one:
    # a title such as "Day by Day" repeats its final word.
    if (( position > 0 )); then
      pipeline+=' | '
    fi
    pipeline+="grep \"${word}\""
  done

  printf '%s\n' "$pipeline"
}
# Entry point.
#   testing [title] - print the grep pipeline badGrepper builds for the given
#                     title (empty when no title is passed)
#   anything else   - clean the input, extract the titles and write one file
#                     per table into $outpath
# Each stage works on the files the previous one wrote, so the run stops at
# the first stage that fails instead of continuing with empty files.
case "${1-}" in
  testing)
    echo HELLO
    grepresult=$(badGrepper "${2-}")
    echo "The following is the command '$grepresult'"
    ;;
  *)
    echo "Fixing spelling from certain pdftotext bugs"
    if ! fixScan; then
      echo "Could not clean $infile; aborting" >&2
      exit 1
    fi
    echo "Getting Titles"
    if ! getTitles; then
      echo "No titles could be extracted into $titleFile; aborting" >&2
      exit 1
    fi
    if ! mkdir -p -- "$outpath"; then
      echo "Could not create $outpath; aborting" >&2
      exit 1
    fi
    echo "Creating lists"
    GetListFromTitle
    ;;
esac