Commit Latest
This commit is contained in:
37
Chartwell.py
37
Chartwell.py
@@ -12,6 +12,19 @@ import torch
|
||||
# Augmented — add that retrieved context to the prompt
|
||||
# Generation — use the language model to generate an answer based on that context
|
||||
|
||||
# -------------------
|
||||
# Embedding Cleaning
|
||||
# -------------------
|
||||
# del embeddings_cache.npz
|
||||
# del embeddings_cache_meta.json
|
||||
|
||||
# -------------------
|
||||
# TO-DO
|
||||
# -----------------
|
||||
# Better table handling
|
||||
# Update requirements.txt with torch installation notes
|
||||
# Domain-specific clean profiles
|
||||
|
||||
# ---------------
|
||||
# Running
|
||||
# --------------
|
||||
@@ -67,11 +80,7 @@ import torch
|
||||
# Semantic chunking
|
||||
# Better table handling
|
||||
|
||||
# -------------------
|
||||
# Embedding Cleaning
|
||||
# -------------------
|
||||
# del embeddings_cache.npz
|
||||
# del embeddings_cache_meta.json
|
||||
|
||||
|
||||
|
||||
# -------------------------
|
||||
@@ -84,11 +93,11 @@ for f in Path(BOOK_DIR).rglob('*'):
|
||||
if not f.is_file():
|
||||
continue
|
||||
try:
|
||||
with open(f, 'r', encoding='utf-8'):
|
||||
with open(f, 'rb'): # just check file is readable
|
||||
pass
|
||||
book_files.append(str(f)) # store as string, not Path
|
||||
except (UnicodeDecodeError, PermissionError):
|
||||
continue
|
||||
book_files.append(str(f))
|
||||
except PermissionError:
|
||||
continue
|
||||
print(f"Found {len(book_files)} files")
|
||||
|
||||
|
||||
@@ -260,8 +269,14 @@ else:
|
||||
print(f"Warning: {book_name} not found, skipping...")
|
||||
continue
|
||||
print(f"Loading {book_name}...")
|
||||
with open(book_name, "r", encoding="utf-8") as f:
|
||||
book_text = clean_text(f.read())
|
||||
with open(book_name, "rb") as f:
|
||||
raw = f.read()
|
||||
try:
|
||||
text = raw.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
print(f"[Encoding fallback] {book_name}")
|
||||
text = raw.decode("cp1252") # fallback for Windows-encoded text
|
||||
book_text = clean_text(text)
|
||||
book_chunks = chunk_text(book_text)
|
||||
all_chunks.extend(book_chunks)
|
||||
all_sources.extend([book_name] * len(book_chunks))
|
||||
|
||||
Reference in New Issue
Block a user