#!/bin/bash
#
# step-0-Cleaner.sh - clean up raw `pdftotext` output before it is split into
# roll tables (initial draft).
#
# Provenance: this script was introduced by the git patch
#   From 523b0f3e82da8e0ce446be5a65c70d285d36756ad3e819dba118235ed8e81880
#   From: Techognito
#   Date: Fri, 15 Aug 2025 00:00:21 +0200
#   Subject: [PATCH] created the initial draft of a cleaner for the pure
#            `pdftotext` output
# which created step-0-Cleaner.sh and changed README.md (2 files changed,
# 46 insertions, 1 deletion).
#
# README.md notes carried by the same patch:
#   * The project is a set of BASH scripts for turning PDFs that contain
#     multiple rolltables into JSON files that can be imported in
#     FoundryVTT 12-13.
#   * Initially these scripts are intended for "The book of random tables"
#     series by Matt Davids; other sources may be added as the need arises.
#   * Step 0: `pdf2text something.pdf` - turns the PDF into text files.
#   * Step 1: `greatSplitter.sh split` - assumes the source has been cleaned
#     beforehand.
#   * Step 2: `foundrycleaner.sh dice|title|numbers|sort|lines|auto
#     [inputFile] [outputFile]` (renamed from `fundrycleaner.sh`);
#     `auto` is recommended, as it performs all changes.
#
# Usage: step-0-Cleaner.sh inputFile [outputFile] [titleFile] [outPath]
#   inputFile   text produced by pdftotext
#   outputFile  where the cleaned text is written (default: stdout)
#   titleFile   file that getTitles writes / GetListFromTitle reads
#   outPath     directory that receives one "<title>.txt" per table
#
# The draft hard-coded all four paths to empty strings, which made every
# helper read from stdin or write to "/"; they are positional arguments now.
infile="${1:-}"
outfile="${2:-}"
titleFile="${3:-}"
outpath="${4:-}"

# Print a diagnostic to stderr.
err() {
  printf '%s\n' "$*" >&2
}

# Print the command-line synopsis to stderr.
usage() {
  printf 'Usage: %s inputFile [outputFile] [titleFile] [outPath]\n' \
    "${0##*/}" >&2
}

#######################################
# Extract the table titles listed in the book's table of contents.
#
# Considers the lines from the first "Table of Contents" line through the
# last "How to Use this Book" line, keeps the ones that look like TOC entries
# (title characters followed by a dot leader), drops the "Credits" and
# "How to Use this Book" entries, and writes the bare titles, one per line.
#
# Globals:   infile (read), titleFile (written)
# Arguments: none
# Returns:   0 if at least one title was written, non-zero otherwise
#######################################
getTitles() {
  # UTF-8 bytes of the ff, fi, fl, ffi and ffl ligatures (U+FB00..U+FB04);
  # titles may still contain them if fixScan has not been applied yet.
  local -r ligatures=$'\xef\xac\x80\xef\xac\x81\xef\xac\x82\xef\xac\x83\xef\xac\x84'
  # Characters allowed in a title. The hyphen must be last to be literal:
  # the draft wrote "\-\", which grep parses as a range and which therefore
  # never matched "-", truncating hyphenated titles.
  local -r title_chars="[ #,&'\`´’–A-Za-z0-9_${ligatures}-]"

  if [[ -z "$infile" || ! -r "$infile" ]]; then
    err "getTitles: cannot read input file: '$infile'"
    return 1
  fi
  if [[ -z "$titleFile" ]]; then
    err "getTitles: titleFile is not set"
    return 1
  fi

  # The awk stage replaces the draft's `grep -A <n> ... | grep -B <n> ...`,
  # whose <n> came from `wc -l $_` and so was not the input's line count.
  awk '
    !in_toc && /Table of Contents/ { in_toc = 1 }
    in_toc {
      buffered[++count] = $0
      if (/How to Use this Book/) { last = count }
    }
    END { for (i = 1; i <= last; i++) print buffered[i] }
  ' < "$infile" \
    | grep -E -- "${title_chars}+\.+" \
    | grep -vF -- "Credits" \
    | grep -vF -- "How to Use this Book" \
    | grep -Eo -- "^${title_chars}+" \
    | sed -E 's/^[[:space:]]+//; s/[[:space:]]+$//' \
    | grep -v '^$' > "$titleFile"
}

#######################################
# Replace typographic ligatures emitted by pdftotext with plain ASCII.
#
# NOTE(review): the draft ran `s/ffi/ffi/`, a substitution that changes
# nothing. It is assumed the pattern was meant to be the single-character
# "ffi" ligature (U+FB03); the sibling ligatures are handled the same way.
#
# Globals:   infile (read)
# Arguments: none
# Outputs:   cleaned text on stdout
# Returns:   0 on success, 1 if infile is unset or unreadable
#######################################
fixScan() {
  # Spelled as byte escapes so the script does not depend on its own encoding.
  local -r lig_ff=$'\xef\xac\x80'
  local -r lig_fi=$'\xef\xac\x81'
  local -r lig_fl=$'\xef\xac\x82'
  local -r lig_ffi=$'\xef\xac\x83'
  local -r lig_ffl=$'\xef\xac\x84'

  if [[ -z "$infile" || ! -r "$infile" ]]; then
    err "fixScan: cannot read input file: '$infile'"
    return 1
  fi

  sed \
    -e "s/${lig_ffi}/ffi/g" \
    -e "s/${lig_ffl}/ffl/g" \
    -e "s/${lig_ff}/ff/g" \
    -e "s/${lig_fi}/fi/g" \
    -e "s/${lig_fl}/fl/g" \
    < "$infile"
}

#######################################
# Split the input into one file per title listed in titleFile.
#
# A section starts at the first line that equals the title (ignoring
# surrounding whitespace and form feeds) and ends just before the line that
# equals the next title. Whole-line matching keeps the TOC entry
# ("Title ..... 12") from being mistaken for the section heading. The last
# title's section runs to the end of the input.
#
# Globals:   infile (read), titleFile (read), outpath (read; default ".")
# Arguments: none
# Outputs:   writes "<outpath>/<title>.txt" for every title; warns on stderr
#            when a title has no matching heading
# Returns:   0 on success, 1 if an input is missing or outpath cannot be made
#######################################
GetListFromTitle() {
  local dest=${outpath:-.}
  local -a titles=()
  local title next_title target
  local index

  if [[ -z "$infile" || ! -r "$infile" ]]; then
    err "GetListFromTitle: cannot read input file: '$infile'"
    return 1
  fi
  if [[ -z "$titleFile" || ! -r "$titleFile" ]]; then
    err "GetListFromTitle: cannot read title file: '$titleFile'"
    return 1
  fi
  if ! mkdir -p -- "$dest"; then
    err "GetListFromTitle: cannot create output directory: '$dest'"
    return 1
  fi

  # Load the titles once instead of re-running sed for every line number.
  while IFS= read -r title || [[ -n "$title" ]]; do
    if [[ -n "$title" ]]; then
      titles+=("$title")
    fi
  done < "$titleFile"

  for ((index = 0; index < ${#titles[@]}; index++)); do
    title=${titles[index]}
    next_title=${titles[index + 1]:-}
    # A "/" in a title would otherwise be treated as a directory separator.
    target="$dest/${title//\//_}.txt"

    # Titles go through the environment rather than `awk -v` so backslashes
    # are not interpreted; `x ""` forces string (not numeric) comparison.
    section_start="$title" section_stop="$next_title" awk '
      BEGIN {
        start = ENVIRON["section_start"]
        stop = ENVIRON["section_stop"]
      }
      {
        text = $0
        gsub(/^[ \t\r\f]+|[ \t\r\f]+$/, "", text)
      }
      !copying {
        if ((text "") == (start "")) { copying = 1; print }
        next
      }
      stop != "" && (text "") == (stop "") { exit }
      { print }
    ' < "$infile" > "$target"

    if [[ ! -s "$target" ]]; then
      err "GetListFromTitle: no section found for title: '$title'"
    fi
  done
}

#######################################
# Entry point: run the ligature clean-up on the input file.
#
# Globals:   infile (read), outfile (read)
# Arguments: none
# Outputs:   cleaned text to outfile, or to stdout when outfile is empty;
#            status messages on stderr so they never mix with the text
# Returns:   0 on success, 2 on missing input argument, 1 on other errors
#######################################
main() {
  if [[ -z "$infile" ]]; then
    usage
    return 2
  fi
  # Redirecting onto the input would truncate it before sed reads it.
  if [[ -n "$outfile" && "$infile" -ef "$outfile" ]]; then
    err "input and output must be different files: '$infile'"
    return 1
  fi

  echo "Fixing spelling from certain pdftotext bugs" >&2
  if [[ -n "$outfile" ]]; then
    fixScan > "$outfile"
  else
    fixScan
  fi
}

main "$@"