Add CSV to Pipe delimited. Add corpus texts
This commit is contained in:
79
Chartwell.py
79
Chartwell.py
@@ -83,7 +83,7 @@ import torch
|
||||
# -------------------------
|
||||
# Knowledge base selection
|
||||
# -------------------------
|
||||
BOOK_DIR = 'Books/Religeon' # just a string
|
||||
BOOK_DIR = 'Books/Procedure' # just a string
|
||||
book_files = []
|
||||
|
||||
for f in Path(BOOK_DIR).rglob('*'):
|
||||
@@ -162,20 +162,91 @@ lm_model = GPT4All(
|
||||
allow_download=allow_download
|
||||
)
|
||||
|
||||
# ----------------
|
||||
# Table Narration
|
||||
# ----------------
|
||||
# This will detect and create narrations for table data in pipe form
|
||||
# For example.
|
||||
#| Year | Squadrons | Aircraft |
|
||||
#|------|-----------|----------|
|
||||
#| 1939 | 21 | 252 |
|
||||
#| 1940 | 35 | 420 |
|
||||
|
||||
# If adding data to a corpus try to use this standard form for instance
|
||||
#| Metric | Value | Context |
|
||||
#|--------|-------|---------|
|
||||
#| Standard deduction single 2025 | $15,750 | Under age 65 |
|
||||
#| Standard deduction single 2025 | $17,750 | Age 65 or older |
|
||||
#| Standard deduction MFJ 2025 | $31,500 | Both under 65 |
|
||||
|
||||
def narrate_table(text):
    """
    Detect pipe-delimited tables in *text* and convert them to narrative
    prose before chunking.

    A table is a contiguous run of lines containing '|' whose first line
    holds at least two pipes.  Separator rows (only pipes, dashes, colons
    and whitespace, e.g. ``|----|----|``) are discarded.  Each data row
    whose cell count matches the header becomes a sentence of the form
    ``"In this record, <header> was <value>, ...".``

    Runs that yield no narratable row — fewer than a header plus one data
    line, or every row mismatching the header column count — are passed
    through unchanged instead of being dropped.

    Parameters:
        text: raw document text, possibly containing pipe tables.

    Returns:
        The text with each recognised table replaced by one prose line.
    """
    # Hoisted: matches pure separator rows; reused for every table found.
    separator_re = re.compile(r'^[\|\-\s:]+$')

    lines = text.split('\n')
    result = []
    i = 0
    narrative_count = 0
    table_count = 0

    while i < len(lines):
        line = lines[i].strip()

        if '|' in line and line.count('|') >= 2:
            # Consume the entire contiguous run of pipe-bearing lines.
            table_lines = []
            while i < len(lines) and '|' in lines[i]:
                table_lines.append(lines[i].strip())
                i += 1

            # Drop separator rows; what remains is header + data rows.
            data_lines = [l for l in table_lines
                          if not separator_re.match(l)]

            narratives = []
            if len(data_lines) >= 2:
                headers = [h.strip() for h in data_lines[0].split('|')
                           if h.strip()]

                for row_line in data_lines[1:]:
                    values = [v.strip() for v in row_line.split('|')
                              if v.strip()]
                    # Rows whose cell count mismatches the header are skipped.
                    if len(values) == len(headers):
                        parts = [f"{headers[j]} was {values[j]}"
                                 for j in range(len(headers))]
                        narratives.append(
                            "In this record, " + ", ".join(parts) + ".")
                        narrative_count += 1

            if narratives:
                table_count += 1
                result.append(" ".join(narratives))
            else:
                # FIX: the original appended an empty string here when no
                # row could be narrated, losing the table content entirely.
                # Keep the raw lines so nothing is silently discarded.
                result.extend(table_lines)
        else:
            result.append(lines[i])
            i += 1

    if table_count > 0:
        print(f" [Table narration: {table_count} table(s) detected, "
              f"{narrative_count} row(s) converted]")

    return '\n'.join(result)
|
||||
|
||||
# -------------------------
|
||||
# Clean text
|
||||
# -------------------------
|
||||
def clean_text(text):
    """
    Normalise raw extracted text for chunking.

    Tables are narrated first (via ``narrate_table``) so that pipe rows
    survive the newline collapsing below; then OCR/extraction artifacts
    are stripped.

    Parameters:
        text: raw document text.

    Returns:
        Cleaned, single-spaced text with leading/trailing whitespace removed.
    """
    # Narrate tables before any other cleaning — newline collapsing below
    # would otherwise destroy the table structure.
    text = narrate_table(text)

    # Re-join words hyphenated across a line break ("exam-\nple" -> "example").
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)
    # Collapse all newlines into single spaces.
    text = re.sub(r'\n+', ' ', text)
    # Drop short digit runs fused between a lowercase word and a capitalised
    # one — presumably page numbers from PDF extraction; verify on corpus.
    text = re.sub(r'(?<=[a-z])(\d{1,3})(?=\s[A-Z])', '', text)
    # Strip decorative bullet/arrow glyphs.
    text = re.sub(r'[■•◆▪→]', '', text)
    # Collapse runs of spaces.
    text = re.sub(r' +', ' ', text)
    # Remove footnote/reference markers like "[12]".
    text = re.sub(r'\[\d+\]', '', text)
    # Remove wiki-style placeholders.  (FIX: the original applied this exact
    # substitution twice in a row; the duplicate was redundant and removed.)
    text = re.sub(r'\[citation needed\]', '', text)
    return text.strip()
|
||||
|
||||
# -------------------------
|
||||
# Chunk text with overlap
|
||||
# -------------------------
|
||||
|
||||
Reference in New Issue
Block a user