#!/bin/bash
#
# Helpers for pulling table-of-contents titles out of a text file and for
# cutting the text that lies between consecutive titles into separate files.
#
# Every path below starts out empty; nothing in this script assigns a real
# value to them, so they have to be filled in before the helpers are useful.

# Text file that the functions in this script read.
infile=""
# Not referenced by any function in this script.
outfile=""
# File that getTitles writes the extracted titles to.
titleFile=""
# Presumably the directory for the generated per-title files -- TODO confirm.
outpath=""

#######################################
# Extracts chapter titles from the table of contents of the input text.
#
# Keeps the lines from the first "Table of Contents" line through the last
# "How to Use this Book" line, selects entries that are followed by a dot
# leader, drops the "Credits" and "How to Use this Book" entries, and writes
# the leading title text of each remaining entry to the title file.
# Each title keeps the single non-dot character that precedes the dot leader
# (normally a space), because the extraction pattern ends in [^.].
#
# Globals:   infile (read)     - text file to scan
#            titleFile (read)  - path of the file that receives the titles;
#                                it is overwritten
# Outputs:   error messages on stderr
# Returns:   1 if infile is unreadable or titleFile is empty; otherwise the
#            exit status of the final grep (non-zero when no title matched)
#######################################
getTitles() {
  local total
  # "ffi" ligature (U+FB03), spelled as raw UTF-8 bytes so the script itself
  # stays ASCII-safe.
  local -r ligature=$'\xef\xac\x83'
  # Characters allowed in a title. The hyphen is placed last so it is taken
  # literally: written as "\-\," inside a bracket expression it forms the
  # range "\".."\" and the hyphen itself is never matched. The backslash is
  # literal inside a POSIX bracket expression and is kept so the accepted
  # character set does not shrink.
  local -r title_class="[\\ #,&'\`´’–${ligature}A-Za-z0-9_-]"

  if [[ ! -r "$infile" ]]; then
    printf 'getTitles: cannot read input file: %s\n' "$infile" >&2
    return 1
  fi
  if [[ -z "$titleFile" ]]; then
    printf 'getTitles: titleFile is not set\n' >&2
    return 1
  fi

  # A context size of "all lines" turns grep -A / -B into "from the first
  # match to the end" and "from the start to the last match". The +1 covers
  # a final line without a trailing newline and normalises wc's padding.
  total=$(wc -l < "$infile") || return 1
  total=$(( total + 1 ))

  grep -A "$total" -- "Table of Contents" "$infile" \
    | grep -B "$total" -- "How to Use this Book" \
    | grep -E -- "${title_class}{1,999}\.{1,}" \
    | grep -v -- "Credits" \
    | grep -v -- "How to Use this Book" \
    | grep -Eo -- "^${title_class}{1,999}[^.]" > "$titleFile"
}

#######################################
# Prints the input text with every single-character "ffi" ligature (U+FB03)
# rewritten as the three plain letters "ffi".
#
# Globals:   infile (read) - file to filter; when it is empty or unset the
#                            text is read from stdin instead
# Outputs:   the filtered text on stdout (the input file is not modified)
# Returns:   exit status of sed
#######################################
fixScan() {
  # The ligature is spelled as raw UTF-8 bytes so it cannot be silently
  # flattened to ASCII "ffi", which would turn the substitution into a no-op.
  local -r ligature=$'\xef\xac\x83'
  local -r script="s/${ligature}/ffi/g"

  if [[ -n "${infile:-}" ]]; then
    sed -e "$script" -- "$infile"
  else
    sed -e "$script"
  fi
}

#######################################
# Splits the input text into one file per title.
#
# For each pair of consecutive titles in the title file, writes the lines of
# the input text from the first line containing the earlier title through the
# last line containing the later title to "<directory>/<earlier title>.txt".
# The final title has no following title to delimit it, so no file is written
# for it. Blank lines in the title file are ignored.
#
# Globals:   infile (read)    - text file to split
#            titleFile (read) - file with one title per line
#            outpath (read)   - output directory; falls back to $path, which
#                               earlier versions used, then to "."
# Outputs:   one .txt file per processed title; errors on stderr
# Returns:   1 if an input file is unreadable or the output directory cannot
#            be created, 0 otherwise
#######################################
GetListFromTitle() {
  local dest=${outpath:-${path:-.}}
  local total title file_name
  local prev=""

  if [[ ! -r "$infile" ]]; then
    printf 'GetListFromTitle: cannot read input file: %s\n' "$infile" >&2
    return 1
  fi
  if [[ ! -r "$titleFile" ]]; then
    printf 'GetListFromTitle: cannot read title file: %s\n' "$titleFile" >&2
    return 1
  fi
  mkdir -p -- "$dest" || return 1

  # A context size of "all lines" makes grep -A / -B mean "to the end" and
  # "from the start". It is computed once instead of twice per title; the +1
  # covers a final line without a newline and normalises wc's padding.
  total=$(wc -l < "$infile") || return 1
  total=$(( total + 1 ))

  while IFS= read -r title || [[ -n "$title" ]]; do
    [[ -n "$title" ]] || continue
    if [[ -n "$prev" ]]; then
      # A slash in a title would otherwise be read as a directory separator.
      file_name=${prev//\//_}
      # -F: titles are literal text, not regular expressions.
      grep -F -A "$total" -- "$prev" "$infile" \
        | grep -F -B "$total" -- "$title" > "${dest}/${file_name}.txt"
    fi
    prev=$title
  done < "$titleFile"

  return 0
}

# Announce the step, then run fixScan, which streams its sed output to stdout.
echo "Fixing spelling from certain pdftotext bugs"
fixScan