FVTT-pdf2json-rolltable-cre.../rolltabletxtfix.sh

44 lines
1.1 KiB
Bash
Executable file

#!/bin/bash
#
# Clean up a numbered text list and renumber its entries sequentially.
#
# Usage: <this script> [input_file [output_file]]
#   input_file   file to clean             (default: cleanme.txt)
#   output_file  file to write the result  (default: iamclean.txt)
#
# Exit status: 0 on success; 1 if the input cannot be read, if input and
# output are the same file, or if the cleaned list cannot be written.

# Failures are checked explicitly below, so -e is intentionally not enabled.
set -u -o pipefail

# Default input and output file names
readonly DEFAULT_INPUT_FILE="cleanme.txt"
readonly DEFAULT_OUTPUT_FILE="iamclean.txt"

#######################################
# Filter a numbered list: strip noise, rejoin split entries, renumber.
# Arguments: none
# Inputs:    raw text on stdin
# Outputs:   cleaned, sequentially numbered text on stdout
# Returns:   exit status of awk
#######################################
clean_list() {
  # A single awk pass using only POSIX regular expressions, so it does not
  # depend on GNU sed extensions such as \f or \+.
  awk '
    # Step 1: strip form feeds, then drop empty lines and lines that are
    # only digits (such as page numbers).
    { gsub(/\f/, "") }
    /^$/ || /^[0-9]+$/ { next }

    # Step 2: a bare "N." line is held back and joined to the line after it.
    # An explicit flag is used because a held "0." is numerically zero and
    # would test false if the held value itself were used as the condition.
    # A held number that is followed by another bare number, or by the end
    # of input, is dropped.
    /^[0-9]+\.$/ {
      held = $0
      has_held = 1
      next
    }
    has_held {
      $0 = held " " $0
      has_held = 0
    }

    # Step 3: replace each leading "N." with the next sequential number.
    /^[0-9]+\./ {
      entry++
      sub(/^[0-9]+\./, entry ".")
    }

    # Print every line that survived the filters.
    { print }
  '
}

#######################################
# Validate the file arguments, run the cleanup, and report the result.
# Globals:   DEFAULT_INPUT_FILE, DEFAULT_OUTPUT_FILE (read)
# Arguments: $1 - input file (optional), $2 - output file (optional)
# Outputs:   writes the cleaned list to the output file; confirmation on
#            stdout; diagnostics on stderr
# Returns:   0 on success, 1 on any failure
#######################################
main() {
  local input_file=${1:-$DEFAULT_INPUT_FILE}
  local output_file=${2:-$DEFAULT_OUTPUT_FILE}

  # Without this check a missing input would still produce an empty output
  # file and a success message.
  if [[ ! -f "$input_file" || ! -r "$input_file" ]]; then
    printf 'Error: cannot read input file: %s\n' "$input_file" >&2
    return 1
  fi

  # Redirecting onto the input would truncate it before it is read.
  if [[ "$input_file" -ef "$output_file" ]]; then
    printf 'Error: input and output are the same file: %s\n' "$input_file" >&2
    return 1
  fi

  if ! clean_list <"$input_file" >"$output_file"; then
    printf 'Error: failed to write cleaned list to %s\n' "$output_file" >&2
    return 1
  fi

  printf '%s\n' "Cleaned and renumbered list saved to $output_file."
}

# Last command: the exit status of the script is the status of main.
main "$@"