From 523b0f3e82da8e0ce446be5a65c70d285d36756ad3e819dba118235ed8e81880 Mon Sep 17 00:00:00 2001
From: Techognito <techognito@noreply.localhost>
Date: Fri, 15 Aug 2025 00:00:21 +0200
Subject: [PATCH] created the initial draft of a cleaner for the pure
 `pdftotext` output

---
 README.md         |  7 ++++++-
 step-0-Cleaner.sh | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 step-0-Cleaner.sh

diff --git a/README.md b/README.md
index 819e703..0a47ee0 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,17 @@
 
 A set of BASH scripts for turning PDFs that contain multiple rolltables into Json filess that can be imported in FoundryVTT 12-13
 
+Initially these scripts are intended for the "The Book of Random Tables" series by Matt Davids, but I will look into adding other sources as I find a need for them.
 
 
 ## Step 0
 ```bash
 pdf2text something.pdf
 ```
+This turns the PDF into a text file.
+
+
+
 ## Step 1
 ```bash
 greatSplitter.sh split
@@ -21,7 +26,7 @@ This does however assume, that the source is cleaned beforehand and each empty l
 
 ## Step 2
 ```bash
-fundrycleaner.sh dice|title|numbers|sort|lines|auto [inputFile] [outputFile]
+foundrycleaner.sh dice|title|numbers|sort|lines|auto [inputFile] [outputFile]
 ```
 recommended is auto, as that performes all changes
 
diff --git a/step-0-Cleaner.sh b/step-0-Cleaner.sh
new file mode 100644
index 0000000..9314264
--- /dev/null
+++ b/step-0-Cleaner.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+infile=""     # input text path; read by getTitles, fixScan and GetListFromTitle
+outfile=""    # NOTE(review): never read in this script — presumably reserved for a later step
+titleFile=""  # file getTitles writes the extracted titles to
+outpath=""    # NOTE(review): presumably the directory for per-title output files — confirm
+
+# Extract the table titles listed between "Table of Contents" and "How to Use this Book" in "$infile" into "$titleFile".
+getTitles() {
+	local total; total=$(wc -l < "$infile") || return  # line count = enough -A/-B context to reach either end of the file
+	grep -A "$total" -- "Table of Contents" "$infile" |
+		grep -B "$total" "How to Use this Book" |
+		grep -E "[\ \#\-\,\&\'\`\´\’\–A-Za-z0-9_]{1,999}\.{1,}" |
+		grep -v "Credits" |
+		grep -v "How to Use this Book" |
+		# keep only the leading title text, dropping the dot leaders and page number
+		grep -Eo "^[\ \#\ﬃ\-\,\&\'\`\´\’\–A-Za-z0-9_]{1,999}[^.]" > "$titleFile"
+}
+
+fixScan(){  # print "$infile" (or stdin when infile is empty) with the "ﬃ" ligature expanded to "ffi"
+	sed -E 's/ﬃ/ffi/g' -- ${infile:+"$infile"}  # expands to one quoted argument, or to none at all
+}
+
+# For each title in "$titleFile", write "$infile" from the first line containing it through the last line containing the next title to "<outpath>/<title>.txt".
+GetListFromTitle() {
+	local total current next
+	total=$(wc -l < "$infile") || return  # enough -A/-B context to reach either end of the file
+	{ IFS= read -r current || return 0  # no titles at all: nothing to write
+	while IFS= read -r next; do  # the last title has no successor, so it gets no file
+		grep -F -A "$total" -- "$current" "$infile" |
+			grep -F -B "$total" -- "$next" > "${outpath:-.}/$current.txt"  # -F: titles are literal text, not regexes
+		current=$next
+	done; } < "$titleFile"
+}
+
+
+
+
+echo "Fixing spelling from certain pdftotext bugs" >&2  # status goes to stderr; stdout carries only fixScan's cleaned text
+fixScan
+