Commit Latest

This commit is contained in:
2026-04-04 14:23:16 -04:00
parent 5e5d8cbcb6
commit 3ccc529045
11 changed files with 222740 additions and 11 deletions

View File

@@ -12,6 +12,19 @@ import torch
# Augmented — add that retrieved context to the prompt
# Generation — use the language model to generate an answer based on that context
# -------------------
# Embedding Cleaning
# -------------------
# del embeddings_cache.npz
# del embeddings_cache_meta.json
# -------------------
# TO-DO
# -----------------
# Better table handling
# Update requirements.txt with torch installation notes
# Domain-specific clean profiles
# ---------------
# Running
# --------------
@@ -67,11 +80,7 @@ import torch
# Semantic chunking
# Better table handling
# -------------------
# Embedding Cleaning
# -------------------
# del embeddings_cache.npz
# del embeddings_cache_meta.json
# -------------------------
@@ -84,11 +93,11 @@ for f in Path(BOOK_DIR).rglob('*'):
if not f.is_file():
continue
try:
with open(f, 'r', encoding='utf-8'):
with open(f, 'rb'): # only confirm the file can be opened; nothing is read or decoded here
pass
book_files.append(str(f)) # store as string, not Path
except (UnicodeDecodeError, PermissionError):
continue
book_files.append(str(f))
except PermissionError:
continue
print(f"Found {len(book_files)} files")
@@ -260,8 +269,14 @@ else:
print(f"Warning: {book_name} not found, skipping...")
continue
print(f"Loading {book_name}...")
with open(book_name, "r", encoding="utf-8") as f:
book_text = clean_text(f.read())
with open(book_name, "rb") as f:
raw = f.read()
try:
text = raw.decode("utf-8")
except UnicodeDecodeError:
print(f"[Encoding fallback] {book_name}")
text = raw.decode("cp1252") # fallback for text saved in the Windows-1252 (cp1252) code page
book_text = clean_text(text)
book_chunks = chunk_text(book_text)
all_chunks.extend(book_chunks)
all_sources.extend([book_name] * len(book_chunks))