Updated step-0 again and added debugger script

Step 0 progress:
- The grep workaround now works, but it causes problems when multiple titles are similar (e.g., "Items in a Office" and "Items in a Port Master's Office"), because the grepper returns both when searching for the first.
- Only 1 known bug remains (the one listed above)
This commit is contained in:
Techognito 2025-08-15 21:21:20 +02:00
parent a8547a5f8e
commit d47c8f88e1
2 changed files with 83 additions and 21 deletions

94
step-0-Cleaner.sh Normal file → Executable file
View file

@ -6,9 +6,9 @@ outpath='./Step-1'
getTitles() {
cat $cleanFile | \
grep -A $(wc -l $infile | \
grep -A $(wc -l $cleanFile | \
awk '{print $1}') "Table of Contents" | \
grep -B $(wc -l $infile | \
grep -B $(wc -l $cleanFile | \
awk '{print $1}') "How to Use this Book" | \
grep -E "[\ \#\-\,\&\'\`\´\\A-Za-z0-9_]{1,999}\.{1,}" | \
grep -v "Credits" | \
@ -22,42 +22,94 @@ fixScan(){
# Split $cleanFile into one text file per title listed in $titleFile.
# For each title it looks up the matching line(s) in $cleanFile, does the same
# for the following title, and writes the text between the two to
# "$outpath/<title number>-<matched line>.txt". The text that follows the last
# title is written after the loop.
# Globals read:    titleFile, cleanFile, infile, inline, outpath
# Globals written: lineTitle, nextLineTitle, grepcommand (not declared local)
# Outputs:         progress/debug lines on stdout
GetListFromTitle() {
# Number of lines in the title file (first field of `wc -l`).
local titlecount=$(wc -l $titleFile | awk '{print $1}')
echo "TitleCount: $titlecount"
local currentlinenum=1
# Each pass handles title N together with title N+1, so the loop stops one
# short of the last title; that one is handled after `done`.
while (( $currentlinenum < $titlecount )); do
echo ""
echo ""
echo ""
echo "$currentlinenum / $titlecount"
# Raw title text: line number $currentlinenum of the title file.
lineTitle=$(sed -n "$currentlinenum p" $titleFile)
echo "Title from Title File= $lineTitle"
# sed -n "$currentlinenum p" $titleFile
#
# NOTE(review): this value is overwritten by the eval result three lines
# below; it looks like a line from an earlier revision - confirm.
local currentline=$(sed -n "$currentlinenum p" $titleFile)
# Build a pipeline as text: drop every line of $cleanFile containing a '.',
# then append whatever badGrepper prints for the title (one grep per word).
grepcommand="cat $cleanFile | grep -vE '\.' | $(badGrepper "$lineTitle")"
echo "Grep commands = $grepcommand"
# Run the pipeline. If several lines of $cleanFile contain every word of the
# title, currentline holds all of them, separated by newlines.
local currentline="$(eval "$grepcommand")"
grepcommand=""
echo "Current Line = $currentline"
echo "CurrentLineNum: $currentlinenum"
# Remember this title's number for the output file name before advancing.
local tableNum=$currentlinenum
currentlinenum=$(( $currentlinenum + 1 ))
echo "CurrentLineNum After+1: $currentlinenum"
# NOTE(review): overwritten by the eval result a few lines below; looks like
# a line from an earlier revision - confirm.
local nextline=$(sed -n "$currentlinenum p" $titleFile)
nextLineTitle=$(sed -n "$currentlinenum p" $titleFile)
echo "Next Lines Title= $nextLineTitle"
# Same lookup as above, for the following title.
grepcommand="cat $cleanFile | grep -vE '\.' | $(badGrepper "$nextLineTitle")"
echo "nexlines grepcommand: $grepcommand"
local nextline="$(eval "$grepcommand")"
echo "Next Line $nextline"
# Line count of $cleanFile, used below as a context size large enough for
# `grep -A` / `grep -B` to reach the end / start of the input.
local maxlines=$(wc -l $cleanFile | awk '{print $1}')
echo $currentline
echo $nextline
# Keep what follows "Good adventuring", then what follows $currentline, then
# what precedes $nextline, and write it to this title's file.
# NOTE(review): this pipeline reads $infile and its output file is rewritten
# by the $cleanFile pipeline below (same path, `>`); it looks like the earlier
# revision of that pipeline - confirm.
cat $infile | \
grep -A $(wc -l $infile | awk '{print $1}') "Good adventuring" | \
grep -A $(wc -l $infile | awk '{print $1}') "$currentline" | \
grep -B $(wc -l $infile | awk '{print $1}') "$nextline" > "$outpath/$tableNum-$currentline.txt"
#exit
# Same extraction from $cleanFile, with $maxlines as the context size.
cat $cleanFile | \
grep -A $maxlines "Good adventuring" | \
grep -A $maxlines "$currentline" | \
grep -B $maxlines "$nextline" > "$outpath/$tableNum-$currentline.txt"
done
# Last title: there is no following title to stop at.
# Here currentline is the raw title-file line, not a line looked up in
# $cleanFile as inside the loop.
local currentline=$(sed -n "$currentlinenum p" $titleFile)
# NOTE(review): $inline is not assigned anywhere in the visible code and
# $currentline is unquoted here; presumably superseded by the $cleanFile
# command at the end of this function - confirm.
cat $inline | grep -A $(wc -l $inline | awk '{print $1}') $currentline > "$outpath/$tableNum-$currentline.txt"
# Debug markers around the last title.
echo HEAERGAERGA
echo $currentline
echo EFAEFAWFE
# Write everything from the last title's line onwards.
# NOTE(review): maxlines is only assigned inside the loop, so it is empty
# here when the loop body never ran (titlecount <= 1) - confirm.
cat $cleanFile | grep -A $maxlines "$currentline" > "$outpath/$titlecount-$currentline.txt"
}
#######################################
# Split a string into whitespace-separated words and append them to the
# global array `grepme`. Despite the name, this is the word splitter that
# badGrepper relies on.
# Globals:
#   grepme (appended to)
#   filter (read), array (appended to) - scratch snippet kept from the
#   original body
# Arguments:
#   $1 - text to split into words
# Outputs:
#   Trace of the scratch snippet on stderr; nothing on stdout.
#######################################
function dumpOfAllMyRandomCommandsAndStuff() {
  local word
  local -a filter_words=() title_words=()

  # `read -a` splits on IFS without pathname expansion, so a word such as
  # "*" or "[1]" stays literal instead of turning into file names the way an
  # unquoted `for i in $1` does. With -d '' read hits end of input and
  # returns non-zero even though the array was filled, hence `|| true`.
  read -r -d '' -a filter_words <<<"${filter-}" || true
  read -r -d '' -a title_words <<<"${1-}" || true

  # Scratch snippet for turning a string variable into an array variable.
  # Its trace goes to stderr: badGrepper calls this function inside $(...),
  # and anything printed on stdout would end up in the generated command.
  {
    printf '%s\n' "${filter_words[*]}"
    for word in "${filter_words[@]}"; do
      array+=("$word")
      printf '%s\n' "$word"
    done
    printf '%s\n' "${array[*]}"
  } >&2

  grepme+=("${title_words[@]}")
}
#######################################
# Print a grep pipeline that keeps only lines containing every word of a
# title, e.g. "Items in a Office" ->
#   grep "Items" | grep "in" | grep "a" | grep "Office"
# The caller appends the printed text to a command string and runs it with
# eval.
# Globals:
#   grepme (reset, then filled by dumpOfAllMyRandomCommandsAndStuff)
# Arguments:
#   $1 - title text; split into words by dumpOfAllMyRandomCommandsAndStuff
# Outputs:
#   The pipeline on stdout (an empty line when the title has no words).
#######################################
function badGrepper() {
  local title=${1-}
  local word char escaped
  local pipeline="" separator=""
  local -i pos

  grepme=()
  # Only the pipeline may appear on stdout, so anything the splitter prints
  # is diverted to stderr.
  dumpOfAllMyRandomCommandsAndStuff "$title" >&2

  for word in "${grepme[@]}"; do
    # The word ends up inside double quotes in text that is later eval'ed.
    # Backslash-escape the four characters that stay special there so the
    # word reaches grep unchanged instead of being expanded or executed.
    escaped=""
    for (( pos = 0; pos < ${#word}; pos++ )); do
      char=${word:pos:1}
      case "$char" in
        '\' | '"' | '$' | '`') escaped+="\\${char}" ;;
        *) escaped+=$char ;;
      esac
    done

    # Join by position. Comparing each word with the last element would
    # drop the separator whenever an earlier word equals the last one.
    pipeline+="${separator}grep \"${escaped}\""
    separator=' | '
  done

  printf '%s\n' "$pipeline"
}
# Unconditional run of the whole step: each stage is announced with an echo
# and then executed by the function of the same purpose.
# NOTE(review): the same sequence appears again in the default arm of the
# `case $1 in` statement that follows (there with `mkdir -p`), so as shown the
# stages would run twice - presumably these lines are the pre-change version;
# confirm.
echo "Fixing spelling from certain pdftotext bugs"
fixScan
echo "Getting Titles"
getTitles
# Without -p, mkdir reports an error when $outpath already exists.
mkdir $outpath
echo "Creating lists"
GetListFromTitle
# Entry point.
#   <script> testing [TITLE]  - debug mode: print the grep pipeline that
#                               badGrepper builds for TITLE and do nothing else
#   <script> [anything else]  - run the full step: fix the scan, extract the
#                               titles, create $outpath, write one list per title
case "${1-}" in
  testing)
    echo HELLO
    # Forward the optional second argument as the title. Called without a
    # title, badGrepper has no words to work with and the printed command is
    # always empty, which makes the debug mode useless.
    grepresult=$(badGrepper "${2-}")
    echo "The following is the command '$grepresult'"
    ;;
  *)
    echo "Fixing spelling from certain pdftotext bugs"
    fixScan
    echo "Getting Titles"
    getTitles
    # -p keeps re-runs from failing when the directory already exists.
    mkdir -p -- "$outpath"
    echo "Creating lists"
    GetListFromTitle
    ;;
esac