Add CSV to Pipe delimited. Add corpus texts

This commit is contained in:
2026-04-06 09:58:32 -04:00
parent 535e07a61c
commit 2e3dd2fff0
7 changed files with 10304 additions and 4 deletions

View File

@@ -83,7 +83,7 @@ import torch
# -------------------------
# Knowledge base selection
# -------------------------
BOOK_DIR = 'Books/Religeon' # earlier corpus directory; overridden by the assignment on the next line
BOOK_DIR = 'Books/Procedure' # corpus directory actually used (relative path string, wrapped in Path() below)
book_files = []  # NOTE(review): presumably filled by the rglob loop that follows - loop body not visible here
for f in Path(BOOK_DIR).rglob('*'):
@@ -162,20 +162,91 @@ lm_model = GPT4All(
allow_download=allow_download
)
# ----------------
# Table Narration
# ----------------
# Detects pipe-delimited tables in the text and converts each data row
# into a narrative sentence. For example:
#| Year | Squadrons | Aircraft |
#|------|-----------|----------|
#| 1939 | 21 | 252 |
#| 1940 | 35 | 420 |
# When adding tabular data to a corpus, prefer this standard form, for instance:
#| Metric | Value | Context |
#|--------|-------|---------|
#| Standard deduction single 2025 | $15,750 | Under age 65 |
#| Standard deduction single 2025 | $17,750 | Age 65 or older |
#| Standard deduction MFJ 2025 | $31,500 | Both under 65 |
def _split_table_row(row):
    """Split one pipe-delimited row into stripped cells, keeping empty cells."""
    row = row.strip()
    # Remove only the optional outer pipes so that empty interior cells
    # keep their column position.
    if row.startswith('|'):
        row = row[1:]
    if row.endswith('|'):
        row = row[:-1]
    return [cell.strip() for cell in row.split('|')]


def narrate_table(text):
    """
    Detect pipe-delimited tables and convert them to narrative prose
    before chunking.

    A table is a run of consecutive lines that each contain a pipe, where
    the first line has at least two pipes. Separator rows (only pipes,
    dashes, colons and whitespace) are ignored. The first remaining row is
    the header; every following row becomes a sentence of the form
    "In this record, <header> was <value>, ...".

    Rows with empty cells are narrated with those cells skipped. Rows whose
    column count cannot be matched to the header are kept verbatim instead
    of being dropped, and a table in which no row could be converted is
    left untouched, so no source data is lost.

    Args:
        text: The raw document text.

    Returns:
        The text with convertible table rows replaced by sentences. All
        output for one table is joined by single spaces on one line.

    Side effects:
        Prints a one-line summary when at least one table was detected.
    """
    separator_row = re.compile(r'^[\|\-\s:]+$')
    lines = text.split('\n')
    result = []
    i = 0
    narrative_count = 0
    table_count = 0
    while i < len(lines):
        # A table can only start on a line with at least two pipes.
        if lines[i].strip().count('|') < 2:
            result.append(lines[i])
            i += 1
            continue

        # Consume the whole run of pipe-bearing lines as one candidate table.
        table_lines = []
        while i < len(lines) and '|' in lines[i]:
            table_lines.append(lines[i].strip())
            i += 1

        data_lines = [row for row in table_lines
                      if not separator_row.match(row)]
        if len(data_lines) < 2:
            # Header without data (or a stray line): not a usable table.
            result.extend(table_lines)
            continue

        table_count += 1
        headers = [h for h in _split_table_row(data_lines[0]) if h]
        pieces = []
        converted = 0
        for row_line in data_lines[1:]:
            # Prefer positional matching so an empty cell does not shift
            # later values under the wrong header.
            values = _split_table_row(row_line)
            if len(values) != len(headers):
                # Fall back to ignoring empty cells entirely.
                values = [v for v in values if v]
            parts = []
            if len(values) == len(headers):
                parts = [f"{header} was {value}"
                         for header, value in zip(headers, values)
                         if value]
            if parts:
                pieces.append("In this record, " + ", ".join(parts) + ".")
                converted += 1
            else:
                # Cannot be aligned with the header: keep the raw row so
                # its data is not silently discarded.
                pieces.append(row_line)

        if converted:
            narrative_count += converted
            result.append(" ".join(pieces))
        else:
            # Nothing could be narrated; leave the table exactly as found.
            result.extend(table_lines)

    if table_count > 0:
        print(f" [Table narration: {table_count} table(s) detected, "
              f"{narrative_count} row(s) converted]")
    return '\n'.join(result)
# -------------------------
# Clean text
# -------------------------
def clean_text(text):
    """
    Normalize raw corpus text into a single clean line for chunking.

    Steps, in order: narrate pipe tables, rejoin words hyphenated across a
    line break, flatten newlines to spaces, drop footnote digits glued to
    the end of a lowercase word, remove bullet glyphs, remove bracketed
    numeric references and "[citation needed]" markers, then collapse runs
    of spaces.

    Args:
        text: The raw document text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Narrate tables first, while the newlines that delimit rows still exist.
    text = narrate_table(text)
    # Rejoin words that were hyphenated across a line break ("exam-\nple").
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)
    text = re.sub(r'\n+', ' ', text)
    # Drop 1-3 digit footnote markers attached to a lowercase letter and
    # followed by whitespace plus a capital letter ("word12 Next").
    text = re.sub(r'(?<=[a-z])(\d{1,3})(?=\s[A-Z])', '', text)
    # NOTE(review): a broader rule stripping any standalone 1-4 digit number
    # was disabled here; presumably it also removed real data such as
    # years - confirm before re-enabling.
    text = re.sub(r'[■•◆▪→]', '', text)
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\[citation needed\]', '', text)
    # Collapse spaces last so the gaps left by the removals above do not
    # survive as double spaces.
    text = re.sub(r' +', ' ', text)
    return text.strip()
# -------------------------
# Chunk text with overlap
# -------------------------