Created the initial draft of a cleaner for the raw pdftotext output

2025-08-15 00:00:21 +02:00 · 2025-08-15 00:00:21 +02:00 · 523b0f3e82
commit 523b0f3e82
parent b4891f3061
2 changed files with 46 additions and 1 deletions
--- a/step-0-Cleaner.sh
+++ b/step-0-Cleaner.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+# Global paths shared by the functions below. All are empty placeholders in
+# this draft and must be filled in before any function can do useful work.
+# infile: the text file that getTitles, fixScan and GetListFromTitle read.
+infile=""
+# outfile: not used by any function in this draft.
+outfile=""
+# titleFile: getTitles writes the extracted titles here, one per line.
+titleFile=""
+# outpath: presumably the directory for the per-title output files — TODO confirm.
+outpath=""
+
+# Extract the titles listed in the table of contents of "$infile" and write
+# them, one per line, to "$titleFile".
+# Globals:   infile (read), titleFile (written)
+# Returns:   1 if infile is unreadable or titleFile is empty; otherwise the
+#            status of the final grep (non-zero when no title was found).
+getTitles() {
+	# Characters allowed in a title. The hyphen is listed first so it is taken
+	# literally: a backslash does not escape inside a bracket expression, so
+	# writing "\-\," creates a range and the plain hyphen is never matched.
+	local -r title_chars="[-A-Za-z0-9_ #,&'\`´’–ﬃ]"
+	local total
+
+	[[ -r "$infile" && -n "$titleFile" ]] || {
+		printf 'getTitles: infile must be readable and titleFile must be set\n' >&2
+		return 1
+	}
+
+	# Using the file's line count as context length makes -A mean "through the
+	# end of the file" and -B mean "from the start of the input". The $(( ))
+	# wrapper strips the padding some wc implementations print.
+	total=$(( $(wc -l < "$infile") ))
+
+	# 1-2: keep the span from "Table of Contents" to "How to Use this Book".
+	# 3:   keep only entries followed by leader dots.
+	# 4-5: drop the entries that are not chapters.
+	# 6:   cut each entry down to the title text in front of the dots.
+	grep -A "$total" -- "Table of Contents" "$infile" \
+		| grep -B "$total" -- "How to Use this Book" \
+		| grep -E -- "${title_chars}{1,999}\.{1,}" \
+		| grep -v -- "Credits" \
+		| grep -v -- "How to Use this Book" \
+		| grep -Eo -- "^${title_chars}{1,999}[^.]" > "$titleFile"
+}
+
+# Print "$infile" to stdout with every "ﬃ" ligature replaced by the plain
+# letters "ffi". The input file itself is not modified.
+# Globals:   infile (read)
+# Returns:   the exit status of sed (non-zero if infile cannot be read)
+fixScan() {
+	# Both the script and the path are quoted: an unquoted empty $infile would
+	# vanish from the command line and leave sed waiting on stdin, and a path
+	# containing spaces would be split. No "m" flag is needed because the
+	# pattern has no ^ or $ anchors for it to affect.
+	sed -E 's/ﬃ/ffi/g' -- "$infile"
+}
+
+# For each consecutive pair of titles in "$titleFile", write the part of
+# "$infile" that runs from the first line containing the earlier title through
+# the last line containing the later title to "<outpath>/<earlier title>.txt".
+# The final title has no successor to bound it, so no file is written for it.
+# A title that is not found in infile yields an empty output file.
+# Globals:   infile, titleFile, outpath (all read)
+# Returns:   1 if infile or titleFile is unreadable, otherwise 0
+GetListFromTitle() {
+	local total current next
+
+	[[ -r "$infile" && -r "$titleFile" ]] || {
+		printf 'GetListFromTitle: infile and titleFile must be readable\n' >&2
+		return 1
+	}
+
+	# Line count of infile, computed once; used as the -A/-B context length so
+	# that the context always reaches the end/start of the input.
+	total=$(( $(wc -l < "$infile") ))
+
+	{
+		# Prime the pair with the first title; an empty list is not an error.
+		IFS= read -r current || return 0
+		while IFS= read -r next; do
+			# -F: titles are literal text, not regular expressions.
+			# An empty outpath falls back to the current directory instead of
+			# silently writing to the filesystem root.
+			grep -F -A "$total" -- "$current" "$infile" \
+				| grep -F -B "$total" -- "$next" > "${outpath:-.}/$current.txt"
+			current=$next
+		done
+	} < "$titleFile"
+}
+
+
+
+
+# Announce the ligature clean-up pass, then run it.
+printf '%s\n' "Fixing spelling from certain pdftotext bugs"
+fixScan
+