Commit Latest

2026-04-04 14:23:16 -04:00
parent 5e5d8cbcb6
commit 3ccc529045
11 changed files with 222740 additions and 11 deletions
--- a/Chartwell.py
+++ b/Chartwell.py
@@ -12,6 +12,19 @@ import torch
 # Augmented — add that retrieved context to the prompt
 # Generation — use the language model to generate an answer based on that context

+# -------------------
+# Embedding Cleaning
+# -------------------
+# del embeddings_cache.npz
+# del embeddings_cache_meta.json
+
+# -------------------
+# TO-DO
+# -----------------
+# Better table handling
+# Update requirements.txt with torch installation notes
+# Domain-specific clean profiles
+
 # ---------------
 # Running
 # --------------
@@ -67,11 +80,7 @@ import torch
 # Semantic chunking
 # Better table handling

-# -------------------
-# Embedding Cleaning
-# -------------------
-# del embeddings_cache.npz
-# del embeddings_cache_meta.json
+


 # -------------------------
@@ -84,11 +93,11 @@ for f in Path(BOOK_DIR).rglob('*'):
    if not f.is_file():
        continue
    try:
-        with open(f, 'r', encoding='utf-8'):
+        with open(f, 'rb'):  # just check file is readable
            pass
-        book_files.append(str(f))  # store as string, not Path
-    except (UnicodeDecodeError, PermissionError):
-        continue
+        book_files.append(str(f))  # reached only if the open() probe above succeeded
+    except OSError:  # PermissionError or any other I/O failure: skip the unreadable file
+        continue
 print(f"Found {len(book_files)} files")


@@ -260,8 +269,14 @@ else:
            print(f"Warning: {book_name} not found, skipping...")
            continue
        print(f"Loading {book_name}...")
-        with open(book_name, "r", encoding="utf-8") as f:
-            book_text = clean_text(f.read())
+        with open(book_name, "rb") as f:
+            raw = f.read()  # read bytes so decoding can be retried with another codec
+        try:
+            text = raw.decode("utf-8")
+        except UnicodeDecodeError:
+            print(f"[Encoding fallback] {book_name}")
+            text = raw.decode("cp1252", errors="replace")  # cp1252 leaves 0x81/0x8D/0x8F/0x90/0x9D undefined; replace instead of raising again
+        book_text = clean_text(text)
        book_chunks = chunk_text(book_text)
        all_chunks.extend(book_chunks)
        all_sources.extend([book_name] * len(book_chunks))