created the initial draft of a cleaner for the pure pdftotext output

This commit is contained in:
Techognito 2025-08-15 00:00:21 +02:00
parent b4891f3061
commit 523b0f3e82
2 changed files with 46 additions and 1 deletions

View file

@ -2,12 +2,17 @@
A set of BASH scripts for turning PDFs that contain multiple rolltables into JSON files that can be imported in FoundryVTT 12-13 A set of BASH scripts for turning PDFs that contain multiple rolltables into JSON files that can be imported in FoundryVTT 12-13
Initially these scripts are intended for "The book of random tables" series by Matt Davids, but I will look into adding other sources as I find the need for them.
## Step 0 ## Step 0
```bash ```bash
pdftotext something.pdf pdftotext something.pdf
``` ```
turns pdf into text files
## Step 1 ## Step 1
```bash ```bash
greatSplitter.sh split greatSplitter.sh split
@ -21,7 +26,7 @@ This does however assume, that the source is cleaned beforehand and each empty l
## Step 2 ## Step 2
```bash ```bash
fundrycleaner.sh dice|title|numbers|sort|lines|auto [inputFile] [outputFile] foundrycleaner.sh dice|title|numbers|sort|lines|auto [inputFile] [outputFile]
``` ```
recommended is auto, as that performs all changes recommended is auto, as that performs all changes

40
step-0-Cleaner.sh Normal file
View file

@ -0,0 +1,40 @@
#!/bin/bash
# Script-wide settings shared by the functions in this file.
# Every one of them starts out as the empty string.
infile=
outfile=
titleFile=
outpath=
#######################################
# Extract the table titles from the table-of-contents part of the input text.
#
# Keeps the lines from the first "Table of Contents" match through the last
# "How to Use this Book" match, selects the entries that consist of title
# characters followed by dot leaders, drops the "Credits" and
# "How to Use this Book" entries, and writes the leading title text of every
# remaining line (one title per line) to the title file.
#
# Globals:   infile (read), titleFile (written)
# Arguments: none
# Returns:   exit status of the final grep (non-zero when no title was
#            found), or non-zero when the input file cannot be read
#######################################
getTitles() {
  # Number of lines in the input; used as "unlimited" context for -A / -B.
  # The original used $_ here, which is not the input file inside a pipeline.
  local total
  total=$(wc -l < "$infile") || return
  total=$((total)) # normalise padded output of some wc implementations

  # Characters allowed in a title. Backslash is a literal inside a bracket
  # expression, and the hyphen is placed last so that it is a literal too
  # (written as \-\ it formed a range and hyphenated titles were truncated).
  local title_chars="\\A-Za-z0-9_ #,&'\`´-"

  grep -A "$total" -- "Table of Contents" "$infile" \
    | grep -B "$total" -- "How to Use this Book" \
    | grep -E -- "[${title_chars}]{1,999}\.{1,}" \
    | grep -v -- "Credits" \
    | grep -v -- "How to Use this Book" \
    | grep -Eo -- "^[${title_chars}]{1,999}[^.]" > "$titleFile"
}
#######################################
# Expand the Latin typographic ligatures that pdftotext leaves in its output
# (U+FB00..U+FB04: ff, fi, fl, ffi, ffl) into plain ASCII letters.
#
# The corrected text is written to stdout; the input file is not modified.
# When infile is empty the text is read from stdin, as before.
#
# Globals:   infile (read)
# Arguments: none
# Outputs:   the corrected text on stdout
# Returns:   exit status of sed
#######################################
fixScan() {
  local -a src=()
  if [[ -n "${infile:-}" ]]; then
    src=(-- "$infile")
  fi
  # The ligatures are matched as raw UTF-8 byte sequences under LC_ALL=C so
  # the result does not depend on the caller's locale. The previous
  # expression had an identical pattern and replacement and changed nothing.
  LC_ALL=C sed \
    -e $'s/\xef\xac\x83/ffi/g' \
    -e $'s/\xef\xac\x84/ffl/g' \
    -e $'s/\xef\xac\x80/ff/g' \
    -e $'s/\xef\xac\x81/fi/g' \
    -e $'s/\xef\xac\x82/fl/g' \
    "${src[@]}"
}
#######################################
# Split the input text into one file per title listed in the title file.
#
# For every title, the section runs from the first line of the input that
# contains the title through the last line that contains the following
# title (that line is included). The final title has no successor, so its
# section runs to the end of the input. Titles are matched as fixed
# strings, not as regular expressions.
#
# Globals:   titleFile (read), infile (read), outpath (read; the current
#            directory is used when it is empty)
# Arguments: none
# Outputs:   writes "<outpath>/<title>.txt" for every title
# Returns:   non-zero when the title file or the input cannot be read,
#            otherwise the status of the last section extraction
#######################################
GetListFromTitle() {
  local dest=${outpath:-.}

  # Load the titles, skipping blank lines: an empty fixed-string pattern
  # would match every line of the input.
  local -a titles=()
  local title
  while IFS= read -r title || [[ -n "$title" ]]; do
    [[ -n "$title" ]] || continue
    titles+=("$title")
  done < "$titleFile" || return

  # Number of lines in the input; used as "unlimited" context for -A / -B.
  local total
  total=$(wc -l < "$infile") || return
  total=$((total)) # normalise padded output of some wc implementations

  local i current next
  for ((i = 0; i < ${#titles[@]}; i++)); do
    current=${titles[i]}
    if ((i + 1 < ${#titles[@]})); then
      next=${titles[i + 1]}
      grep -F -A "$total" -- "$current" "$infile" \
        | grep -F -B "$total" -- "$next" > "$dest/$current.txt"
    else
      # Last title: nothing follows it, so keep everything to the end.
      grep -F -A "$total" -- "$current" "$infile" > "$dest/$current.txt"
    fi
  done
}
# Tell the user which pass is starting, then run fixScan.
printf '%s\n' "Fixing spelling from certain pdftotext bugs"
fixScan