Add CSV to Pipe delimited. Add corpus texts

2026-04-06 09:58:32 -04:00
parent 535e07a61c
commit 2e3dd2fff0
7 changed files with 10304 additions and 4 deletions
--- a/Chartwell.py
+++ b/Chartwell.py
@@ -83,7 +83,7 @@ import torch
 # -------------------------
 # Knowledge base selection
 # -------------------------
-BOOK_DIR = 'Books/Religeon'  # just a string
+BOOK_DIR = 'Books/Procedure'  # just a string
 book_files = []

 for f in Path(BOOK_DIR).rglob('*'):
@@ -162,20 +162,91 @@ lm_model = GPT4All(
    allow_download=allow_download
 )

+# ----------------
+# Table Narration
+# ----------------
+# Detects tables written in pipe-delimited form and rewrites each data row as a narrative sentence.
+# For example:
+#| Year | Squadrons | Aircraft |
+#|------|-----------|----------|
+#| 1939 | 21        | 252      |
+#| 1940 | 35        | 420      |
+
+# When adding tabular data to a corpus, prefer this standard form, for instance:
+#| Metric | Value | Context |
+#|--------|-------|---------|
+#| Standard deduction single 2025 | $15,750 | Under age 65 |
+#| Standard deduction single 2025 | $17,750 | Age 65 or older |
+#| Standard deduction MFJ 2025 | $31,500 | Both under 65 |
+
+def narrate_table(text):
+    """
+    Detect pipe-delimited tables in ``text`` and rewrite each one as
+    narrative prose before chunking.
+
+    A table starts at a line holding at least two ``|`` characters and
+    runs over every following line that contains a ``|``. Separator rows
+    (made only of pipes, dashes, colons and whitespace) are ignored. The
+    first remaining line supplies the headers; each later row becomes one
+    sentence of the form
+    ``In this record, <header> was <value>, <header> was <value>.``
+    and all sentences of one table are joined into a single output line.
+
+    Args:
+        text: Raw document text made of newline-separated lines.
+
+    Returns:
+        The text with each converted table replaced by its narration.
+        Lines outside tables are passed through unchanged. A pipe block
+        with fewer than two non-separator lines is kept as-is, except
+        that each of its lines is stripped of surrounding whitespace.
+
+    Side effects:
+        Prints a one-line summary when at least one table was converted.
+    """
+    lines = text.split('\n')
+    result = []
+    i = 0  # cursor into `lines`; advanced manually so a whole table can be consumed at once
+    narrative_count = 0  # rows turned into sentences (reported in the summary)
+    table_count = 0  # tables with a header plus at least one more data line
+    
+    while i < len(lines):
+        line = lines[i].strip()
+        
+        # Two or more pipes mark the start of a table. Any ordinary prose
+        # line that happens to contain two pipes is treated the same way.
+        if '|' in line and line.count('|') >= 2:
+            table_lines = []
+            # Consume the contiguous run of lines containing a pipe; a
+            # single '|' is enough to keep a line inside the table.
+            while i < len(lines) and '|' in lines[i]:
+                table_lines.append(lines[i].strip())
+                i += 1
+            
+            # Drop markdown separator rows such as |------|:-----:|
+            data_lines = [l for l in table_lines 
+                         if not re.match(r'^[\|\-\s:]+$', l)]
+            
+            # Need a header line plus at least one more line to narrate.
+            if len(data_lines) >= 2:
+                table_count += 1
+                # Empty cells are discarded, so leading/trailing pipes
+                # do not produce blank headers.
+                headers = [h.strip() for h in data_lines[0].split('|') 
+                          if h.strip()]
+                
+                narratives = []
+                for row_line in data_lines[1:]:
+                    values = [v.strip() for v in row_line.split('|') 
+                             if v.strip()]
+                    # NOTE(review): because empty cells are filtered out
+                    # above, a row with any blank cell has fewer values
+                    # than headers and is silently dropped from the
+                    # output (there is no else branch) - confirm this
+                    # data loss is intended.
+                    if len(values) == len(headers):
+                        parts = [f"{headers[j]} was {values[j]}" 
+                                for j in range(len(headers))]
+                        sentence = "In this record, " + ", ".join(parts) + "."
+                        narratives.append(sentence)
+                        narrative_count += 1
+                
+                # One output line per table; this is an empty string when
+                # no row matched the header width.
+                result.append(" ".join(narratives))
+            else:
+                # Not enough rows to narrate: keep the (stripped) lines.
+                result.extend(table_lines)
+        else:
+            # Non-table line: keep the original, unstripped text.
+            result.append(lines[i])
+            i += 1
+
+    if table_count > 0:
+        print(f"  [Table narration: {table_count} table(s) detected, "
+              f"{narrative_count} row(s) converted]")
+
+    return '\n'.join(result)
+
 # -------------------------
 # Clean text
 # -------------------------
 def clean_text(text):
+    """
+    Clean raw document text before it is chunked.
+
+    Tables are narrated first, then line-break hyphenation, newlines,
+    stray digits, bullet glyphs, repeated spaces, numeric reference
+    markers and "[citation needed]" tags are removed or collapsed.
+
+    Args:
+        text: Raw document text.
+
+    Returns:
+        The cleaned text with all newlines replaced by spaces and
+        leading/trailing whitespace stripped.
+    """
+    # Narrate tables before any other cleaning: narrate_table() splits on
+    # line breaks, which the newline collapse below removes.
+    text = narrate_table(text)
+    
+    # Pre-existing cleaning steps follow.
+    # Re-join words that were hyphenated across a line break.
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)
+    # Replace every run of newlines with a single space.
    text = re.sub(r'\n+', ' ', text)
+    # Remove a run of 1-3 digits glued to a lowercase letter and followed
+    # by whitespace plus a capital letter (presumably inline footnote
+    # markers - confirm against the corpus).
    text = re.sub(r'(?<=[a-z])(\d{1,3})(?=\s[A-Z])', '', text)
-#    text = re.sub(r'\s\d{1,4}\s', ' ', text)
+    # Strip bullet and arrow glyphs.
    text = re.sub(r'[■•◆▪→]', '', text)
+    # Collapse repeated spaces into one.
    text = re.sub(r' +', ' ', text)
+    # Drop bracketed numeric reference markers such as [12].
    text = re.sub(r'\[\d+\]', '', text)
-    text = re.sub(r'\[citation needed\]', '', text)    
+    # Drop literal "[citation needed]" tags.
+    text = re.sub(r'\[citation needed\]', '', text)
    return text.strip()
-
 # -------------------------
 # Chunk text with overlap
 # -------------------------