created the initial draft of a cleaner for the pure pdftotext output
This commit is contained in:
parent
b4891f3061
commit
523b0f3e82
2 changed files with 46 additions and 1 deletions
|
|
@ -2,12 +2,17 @@
|
||||||
|
|
||||||
A set of Bash scripts for turning PDFs that contain multiple rolltables into JSON files that can be imported into FoundryVTT 12-13
|
A set of Bash scripts for turning PDFs that contain multiple rolltables into JSON files that can be imported into FoundryVTT 12-13
|
||||||
|
|
||||||
|
Initially, these scripts are intended for the "The book of random tables" series by Matt Davids, but I will look into adding other sources as the need arises.
|
||||||
|
|
||||||
|
|
||||||
## Step 0
|
## Step 0
|
||||||
```bash
|
```bash
|
||||||
pdftotext something.pdf
|
pdftotext something.pdf
|
||||||
```
|
```
|
||||||
|
This converts the PDF into plain-text files.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Step 1
|
## Step 1
|
||||||
```bash
|
```bash
|
||||||
greatSplitter.sh split
|
greatSplitter.sh split
|
||||||
|
|
@ -21,7 +26,7 @@ This does however assume, that the source is cleaned beforehand and each empty l
|
||||||
|
|
||||||
## Step 2
|
## Step 2
|
||||||
```bash
|
```bash
|
||||||
fundrycleaner.sh dice|title|numbers|sort|lines|auto [inputFile] [outputFile]
|
foundrycleaner.sh dice|title|numbers|sort|lines|auto [inputFile] [outputFile]
|
||||||
```
|
```
|
||||||
`auto` is the recommended mode, as it performs all of the changes.
|
`auto` is the recommended mode, as it performs all of the changes.
|
||||||
|
|
||||||
|
|
|
||||||
40
step-0-Cleaner.sh
Normal file
40
step-0-Cleaner.sh
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
#!/bin/bash

# Script-wide path settings. Each one is initialised to the empty string
# here; nothing in this part of the script assigns a real value yet.
infile=''
outfile=''
titleFile=''
outpath=''
|
||||||
|
|
||||||
|
#######################################
# Extract the table titles listed in the book's table of contents.
#
# Narrows $infile to the text from the first "Table of Contents" line
# through the last "How to Use this Book" line, keeps the entries that are
# followed by leader dots ("Title.....12"), drops the "Credits" and
# "How to Use this Book" entries, and writes the title part of every
# remaining entry to $titleFile, one title per line.
#
# Globals:   infile (read), titleFile (path that is written)
# Arguments: none
# Returns:   1 if a path is unusable; otherwise the status of the final
#            grep (non-zero when no title was extracted)
#######################################
getTitles() {
  if [[ -z "$infile" || ! -r "$infile" ]]; then
    printf 'getTitles: cannot read input file: %s\n' "$infile" >&2
    return 1
  fi
  if [[ -z "$titleFile" ]]; then
    printf 'getTitles: titleFile is not set\n' >&2
    return 1
  fi

  # A context size of at least the file length makes -A reach the end of the
  # input and -B reach its start. The "+ 1" covers a missing final newline
  # and normalises any padding in the wc output.
  local span
  span=$(wc -l < "$infile") || return
  span=$((span + 1))

  # Characters accepted inside a title. "-" must stay LAST: anywhere else in
  # a bracket expression it is read as a range operator, which silently
  # removes the hyphen from the set and truncates titles such as "Sci-Fi".
  local title_chars=" #,&'\`´’–A-Za-z0-9_-"

  grep -A "$span" -- "Table of Contents" "$infile" \
    | grep -B "$span" -- "How to Use this Book" \
    | grep -E -- "[${title_chars}]{1,999}\.{1,}" \
    | grep -v -- "Credits" \
    | grep -v -- "How to Use this Book" \
    | grep -Eo -- "^[${title_chars}]{1,999}[^.]" > "$titleFile"
}
|
||||||
|
|
||||||
|
#######################################
# Print $infile with typographic "f" ligatures expanded to plain letters.
#
# NOTE(review): the previous substitution replaced "ffi" with "ffi", i.e. it
# changed nothing. Its pattern was presumably the single-glyph ligature
# U+FB03 that got flattened to ASCII somewhere along the way - confirm.
# The ligatures are therefore spelled as UTF-8 byte escapes, which keeps
# this file pure ASCII and immune to that kind of normalisation.
#
# Globals:   infile (read)
# Arguments: none
# Outputs:   the corrected text on stdout; $infile itself is not modified
# Returns:   1 if $infile is empty or unreadable, otherwise sed's status
#######################################
fixScan() {
  if [[ -z "$infile" || ! -r "$infile" ]]; then
    printf 'fixScan: cannot read input file: %s\n' "$infile" >&2
    return 1
  fi

  local -r lig_ff=$'\xef\xac\x80'   # U+FB00
  local -r lig_fi=$'\xef\xac\x81'   # U+FB01
  local -r lig_fl=$'\xef\xac\x82'   # U+FB02
  local -r lig_ffi=$'\xef\xac\x83'  # U+FB03
  local -r lig_ffl=$'\xef\xac\x84'  # U+FB04

  # Every pattern is a distinct code point, so the order of the expressions
  # does not influence the result.
  sed \
    -e "s/${lig_ffi}/ffi/g" \
    -e "s/${lig_ffl}/ffl/g" \
    -e "s/${lig_ff}/ff/g" \
    -e "s/${lig_fi}/fi/g" \
    -e "s/${lig_fl}/fl/g" \
    -- "$infile"
}
|
||||||
|
|
||||||
|
#######################################
# Split $infile into one text file per title listed in $titleFile.
#
# For every title, the text from the first line containing that title up to
# and including the last line containing the following title is written to
# "<outpath>/<title>.txt". The final title has no successor, so its file
# runs to the end of $infile. Titles are matched as fixed substrings, not
# as regular expressions.
#
# Globals:   infile (read), titleFile (read), outpath (read; "." if empty)
# Arguments: none
# Outputs:   one "<title>.txt" file per non-empty line of $titleFile
# Returns:   1 if an input is unreadable or the output directory cannot be
#            created, otherwise 0
#######################################
GetListFromTitle() {
  if [[ -z "$infile" || ! -r "$infile" ]]; then
    printf 'GetListFromTitle: cannot read input file: %s\n' "$infile" >&2
    return 1
  fi
  if [[ -z "$titleFile" || ! -r "$titleFile" ]]; then
    printf 'GetListFromTitle: cannot read title file: %s\n' "$titleFile" >&2
    return 1
  fi

  # An empty outpath would otherwise make the files land in "/".
  local outdir=${outpath:-.}
  mkdir -p -- "$outdir" || return 1

  # Load the titles once instead of re-reading the title file per iteration.
  local -a titles=()
  local line
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ -n "$line" ]]; then
      titles+=("$line")
    fi
  done < "$titleFile"

  # A context size of at least the file length makes -A reach the end of the
  # input and -B reach its start.
  local span
  span=$(wc -l < "$infile") || return 1
  span=$((span + 1))

  local count=${#titles[@]}
  local i current next
  for ((i = 0; i < count; i++)); do
    current=${titles[i]}
    if ((i + 1 < count)); then
      next=${titles[i + 1]}
      grep -F -A "$span" -- "$current" "$infile" \
        | grep -F -B "$span" -- "$next" > "$outdir/$current.txt"
    else
      # Last title: nothing follows it, so keep everything up to the end.
      grep -F -A "$span" -- "$current" "$infile" > "$outdir/$current.txt"
    fi
  done

  return 0
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Tell the user which step is running, then invoke fixScan.
printf '%s\n' "Fixing spelling from certain pdftotext bugs"
fixScan
|
||||||
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue