created the initial draft of a cleaner for the pure pdftotext output

2025-08-15 00:00:21 +02:00 · 2025-08-15 00:00:21 +02:00 · 523b0f3e82
commit 523b0f3e82
parent b4891f3061
2 changed files with 46 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -2,12 +2,17 @@
 A set of Bash scripts for turning PDFs that contain multiple roll tables into JSON files that can be imported into FoundryVTT 12-13
 Initially, these scripts are intended for "The Book of Random Tables" series by Matt Davids, but I will look into adding other sources as I find a need for them.
 ## Step 0
 ```bash
 pdftotext something.pdf
 ```
 Converts the PDF into a plain-text file.
 ## Step 1
 ```bash
 greatSplitter.sh split
@ -21,7 +26,7 @@ This does however assume, that the source is cleaned beforehand and each empty l
 ## Step 2
 ```bash
-fundrycleaner.sh dice|title|numbers|sort|lines|auto [inputFile] [outputFile]
+foundrycleaner.sh dice|title|numbers|sort|lines|auto [inputFile] [outputFile]
 ```
 `auto` is recommended, as it performs all of the changes.
--- a/step-0-Cleaner.sh
+++ b/step-0-Cleaner.sh
@ -0,0 +1,40 @@
 #!/bin/bash
 # Script-wide settings; all of them start out empty.
 # Text file the functions in this script read from.
 infile=
 # Presumably the destination for cleaned text - TODO confirm against callers.
 outfile=
 # File holding the list of table titles, one per line.
 titleFile=
 # Presumably the directory for generated files - TODO confirm against callers.
 outpath=
 # Extract the table titles from the book's table of contents.
 #
 # Globals:
 #   infile    (read)    - text file to scan
 #   titleFile (written) - receives one title per line
 # Returns:
 #   non-zero if infile cannot be read, otherwise the pipeline's status
 #
 # Keeps the span from the first "Table of Contents" line through the last
 # "How to Use this Book" line, retains only entries followed by at least one
 # dot (the dot leaders), drops the "Credits" and "How to Use this Book"
 # entries, and finally cuts each entry off before its first dot.
 getTitles() {
 	local lineCount
 	# The file's own line count is used as the context size so that -A / -B
 	# always reach the end / start of the input. It has to be derived from
 	# $infile: `$_` is the last argument of the previous command, not the file.
 	lineCount=$(wc -l < "$infile") || return
 	lineCount=$((lineCount))	# strip padding some wc implementations add
 	# NOTE(review): inside the bracket expressions `\-\` parses as a range, so
 	# a literal "-" is probably not part of the class - confirm and move the
 	# hyphen to the end of the class if titles with hyphens get truncated.
 	grep -A "$lineCount" "Table of Contents" "$infile" | \
 		grep -B "$lineCount" "How to Use this Book" | \
 		grep -E "[\ \#\-\,\&\'\`\´\’\–A-Za-z0-9_]{1,999}\.{1,}" | \
 		grep -v "Credits" | \
 		grep -v "How to Use this Book" | \
 		grep -Eo "^[\ \#\ﬃ\-\,\&\'\`\´\’\–A-Za-z0-9_]{1,999}[^.]" > "$titleFile"
 }
 # Replace the "ﬃ" ligature character with the plain letters "ffi".
 #
 # Globals:
 #   infile (read) - file to filter; when empty, standard input is filtered
 # Outputs:
 #   the corrected text on stdout (infile itself is not modified)
 fixScan(){
 	# The sed script is quoted so the shell never word-splits or globs it.
 	# The GNU-only `m` flag was dropped: it only changes how ^ and $ match,
 	# and this pattern uses neither.
 	if [[ -n "$infile" ]]; then
 		sed -E 's/ﬃ/ffi/g' -- "$infile"
 	else
 		sed -E 's/ﬃ/ffi/g'
 	fi
 }
 # Split the book text into one file per table title.
 #
 # Globals:
 #   infile    (read) - text file to split
 #   titleFile (read) - list of titles, one per line
 #   outpath   (read) - directory that receives "<title>.txt"; "." when empty
 # Returns:
 #   non-zero if infile cannot be read or the output directory cannot be made
 #
 # For every title that has a successor in titleFile, writes the part of
 # infile running from the first line containing that title through the last
 # line containing the following title. The final title has no successor and
 # therefore produces no file.
 GetListFromTitle() {
 	local destDir=${outpath:-.}
 	local lineCount currentline="" nextline havePrev=0
 	# The total line count is used as grep context so -A / -B span the file.
 	lineCount=$(wc -l < "$infile") || return
 	lineCount=$((lineCount))	# strip padding some wc implementations add
 	mkdir -p -- "$destDir" || return
 	# Walk the titles pairwise: (1,2), (2,3), ... (n-1,n).
 	while IFS= read -r nextline || [[ -n "$nextline" ]]; do
 		# An empty pattern would match every line; ignore blank title lines.
 		[[ -n "$nextline" ]] || continue
 		if ((havePrev)); then
 			# -F: titles are literal text, not regexes; -e protects a
 			# title that begins with "-".
 			grep -F -A "$lineCount" -e "$currentline" "$infile" | \
 				grep -F -B "$lineCount" -e "$nextline" > "$destDir/$currentline.txt"
 		fi
 		currentline=$nextline
 		havePrev=1
 	done < "$titleFile"
 }
 # Announce the step on stdout, then run the ligature replacement.
 printf '%s\n' "Fixing spelling from certain pdftotext bugs"
 fixScan