Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f7d0978148 |
Binary file not shown.
@@ -1 +1 @@
|
|||||||
{"book_files": ["Books\\Accounting\\Accounting\\Account.txt", "Books\\Accounting\\Accounting\\BankFeed.txt", "Books\\Accounting\\Accounting\\BillingChange.txt", "Books\\Accounting\\Accounting\\CashReceipts.txt", "Books\\Accounting\\Accounting\\Common.txt", "Books\\Accounting\\Accounting\\Endorsement.txt", "Books\\Accounting\\Accounting\\Fifo.txt", "Books\\Accounting\\Accounting\\Integrations.txt", "Books\\Accounting\\Accounting\\PayoffBalance.txt", "Books\\Accounting\\Accounting\\Policy.txt", "Books\\Accounting\\Accounting\\Quote.txt", "Books\\Accounting\\Accounting\\Repository.txt", "Books\\Accounting\\Accounting\\Services.txt", "Books\\Accounting\\Accounting\\Starter.txt", "Books\\Accounting\\Accounting\\StructuredTags.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt", "Books\\Accounting\\Jira\\IA-2691.txt", "Books\\Accounting\\Jira\\IA-2698.txt", "Books\\Accounting\\Jira\\IA-2827.txt", "Books\\Accounting\\Jira\\IA-2852.txt", "Books\\Accounting\\Jira\\IA-2854.txt", "Books\\Accounting\\Jira\\IA-2855.txt", "Books\\Accounting\\Jira\\IA-2858.txt", "Books\\Accounting\\Jira\\IA-2866.txt", "Books\\Accounting\\Oracle\\Oracle.txt"], "file_sizes": {"Books\\Accounting\\Accounting\\Account.txt": 7587, "Books\\Accounting\\Accounting\\BankFeed.txt": 14170, "Books\\Accounting\\Accounting\\BillingChange.txt": 9648, 
"Books\\Accounting\\Accounting\\CashReceipts.txt": 9208, "Books\\Accounting\\Accounting\\Common.txt": 7444, "Books\\Accounting\\Accounting\\Endorsement.txt": 10019, "Books\\Accounting\\Accounting\\Fifo.txt": 9201, "Books\\Accounting\\Accounting\\Integrations.txt": 8451, "Books\\Accounting\\Accounting\\PayoffBalance.txt": 8413, "Books\\Accounting\\Accounting\\Policy.txt": 11662, "Books\\Accounting\\Accounting\\Quote.txt": 6815, "Books\\Accounting\\Accounting\\Repository.txt": 7395, "Books\\Accounting\\Accounting\\Services.txt": 6633, "Books\\Accounting\\Accounting\\Starter.txt": 5075, "Books\\Accounting\\Accounting\\StructuredTags.txt": 5606, "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt": 24315, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt": 22960, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt": 11447, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt": 18039, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt": 21206, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt": 12886, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt": 0, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt": 7669, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt": 8241, "Books\\Accounting\\Jira\\IA-2691.txt": 8131, "Books\\Accounting\\Jira\\IA-2698.txt": 6072, "Books\\Accounting\\Jira\\IA-2827.txt": 7218, "Books\\Accounting\\Jira\\IA-2852.txt": 5870, "Books\\Accounting\\Jira\\IA-2854.txt": 6748, "Books\\Accounting\\Jira\\IA-2855.txt": 6776, "Books\\Accounting\\Jira\\IA-2858.txt": 5925, "Books\\Accounting\\Jira\\IA-2866.txt": 4241, "Books\\Accounting\\Oracle\\Oracle.txt": 262504}}
|
{"book_files": ["Books\\Accounting\\Accounting\\Account.txt", "Books\\Accounting\\Accounting\\BankFeed.txt", "Books\\Accounting\\Accounting\\BillingChange.txt", "Books\\Accounting\\Accounting\\CashReceipts.txt", "Books\\Accounting\\Accounting\\Common.txt", "Books\\Accounting\\Accounting\\Endorsement.txt", "Books\\Accounting\\Accounting\\Fifo.txt", "Books\\Accounting\\Accounting\\Integrations.txt", "Books\\Accounting\\Accounting\\PayoffBalance.txt", "Books\\Accounting\\Accounting\\Policy.txt", "Books\\Accounting\\Accounting\\Quote.txt", "Books\\Accounting\\Accounting\\Repository.txt", "Books\\Accounting\\Accounting\\Services.txt", "Books\\Accounting\\Accounting\\Starter.txt", "Books\\Accounting\\Accounting\\StructuredTags.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt", "Books\\Accounting\\Jira\\IA-2691.txt", "Books\\Accounting\\Jira\\IA-2698.txt", "Books\\Accounting\\Jira\\IA-2827.txt", "Books\\Accounting\\Jira\\IA-2852.txt", "Books\\Accounting\\Jira\\IA-2854.txt", "Books\\Accounting\\Jira\\IA-2855.txt", "Books\\Accounting\\Jira\\IA-2858.txt", "Books\\Accounting\\Jira\\IA-2866.txt"], "file_sizes": {"Books\\Accounting\\Accounting\\Account.txt": 7587, "Books\\Accounting\\Accounting\\BankFeed.txt": 14170, "Books\\Accounting\\Accounting\\BillingChange.txt": 9648, "Books\\Accounting\\Accounting\\CashReceipts.txt": 9208, 
"Books\\Accounting\\Accounting\\Common.txt": 7444, "Books\\Accounting\\Accounting\\Endorsement.txt": 10019, "Books\\Accounting\\Accounting\\Fifo.txt": 9201, "Books\\Accounting\\Accounting\\Integrations.txt": 8451, "Books\\Accounting\\Accounting\\PayoffBalance.txt": 8413, "Books\\Accounting\\Accounting\\Policy.txt": 11662, "Books\\Accounting\\Accounting\\Quote.txt": 6815, "Books\\Accounting\\Accounting\\Repository.txt": 7395, "Books\\Accounting\\Accounting\\Services.txt": 6633, "Books\\Accounting\\Accounting\\Starter.txt": 5075, "Books\\Accounting\\Accounting\\StructuredTags.txt": 5606, "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt": 24315, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt": 22960, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt": 11447, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt": 18039, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt": 21206, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt": 12886, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt": 0, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt": 7669, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt": 8241, "Books\\Accounting\\Jira\\IA-2691.txt": 8131, "Books\\Accounting\\Jira\\IA-2698.txt": 6072, "Books\\Accounting\\Jira\\IA-2827.txt": 7218, "Books\\Accounting\\Jira\\IA-2852.txt": 5870, "Books\\Accounting\\Jira\\IA-2854.txt": 6748, "Books\\Accounting\\Jira\\IA-2855.txt": 6776, "Books\\Accounting\\Jira\\IA-2858.txt": 5925, "Books\\Accounting\\Jira\\IA-2866.txt": 4241}}
|
||||||
Binary file not shown.
File diff suppressed because one or more lines are too long
232
Chartwell.py
232
Chartwell.py
@@ -141,20 +141,6 @@ MODES = {
|
|||||||
"End your response with a single period."
|
"End your response with a single period."
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
# "research": {
|
|
||||||
# "print_msg": "Research mode.",
|
|
||||||
# "prompt_instruction": (
|
|
||||||
# "You are a helpful research assistant. "
|
|
||||||
# "Restrict your response strictly to the provided context. "
|
|
||||||
# "If the source material is exhausted, stop writing. "
|
|
||||||
# "If a relationship or entity is not explicitly documented in the context, do not include it. "
|
|
||||||
# # "Do not infer, supplement, or use external training knowledge. "
|
|
||||||
# "Be direct and concise. "
|
|
||||||
# "Never repeat the context or instructions. "
|
|
||||||
# "Never echo the question. "
|
|
||||||
# "End your answer with a single period. "
|
|
||||||
# )
|
|
||||||
# },
|
|
||||||
|
|
||||||
"research": {
|
"research": {
|
||||||
"print_msg": "Research mode.",
|
"print_msg": "Research mode.",
|
||||||
@@ -162,31 +148,19 @@ MODES = {
|
|||||||
"You are a helpful research assistant. "
|
"You are a helpful research assistant. "
|
||||||
"Restrict your response strictly to the provided context. "
|
"Restrict your response strictly to the provided context. "
|
||||||
"If the source material is exhausted, stop writing. "
|
"If the source material is exhausted, stop writing. "
|
||||||
# "If the context does not contain enough information to fully answer, provide only what is explicitly supported or say you don't know. "
|
|
||||||
"If a relationship or entity is not explicitly documented in the context, do not include it. "
|
"If a relationship or entity is not explicitly documented in the context, do not include it. "
|
||||||
"Do not repeat the same information in different wording. "
|
"Do not repeat the same information in different wording. "
|
||||||
"If multiple context passages express the same idea, summarize it once. "
|
"If multiple context passages express the same idea, summarize it once. "
|
||||||
"If the context contains repetitive legal or procedural text, merge it into a single concise statement. "
|
"If the context contains repetitive legal or procedural text, merge it into a single concise statement. "
|
||||||
"Do not list multiple similar verses. "
|
"Do not list multiple similar verses. "
|
||||||
"Prefer one coherent explanation over multiple extracted quotations. "
|
"Prefer one coherent explanation over multiple extracted quotations. "
|
||||||
# "Do not infer, supplement, or use external training knowledge. "
|
|
||||||
"Do not infer, guess, or use external knowledge under any circumstances. "
|
"Do not infer, guess, or use external knowledge under any circumstances. "
|
||||||
"If the answer is not explicitly stated in the context, respond with: \"I don't know based on the provided context.\""
|
|
||||||
# "Be direct and concise. "
|
|
||||||
"Never repeat the context or instructions. "
|
"Never repeat the context or instructions. "
|
||||||
"Never echo the question. "
|
"Never echo the question. "
|
||||||
"End your answer with a single period. "
|
"End your answer with a single period. "
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
|
|
||||||
# "advanced": {
|
|
||||||
# "print_msg": "Advanced mode.",
|
|
||||||
# "prompt_instruction": (
|
|
||||||
# "You are adept at mathematics and computer programming. "
|
|
||||||
# "You are a linguist able to put together complex ideas and work with formulations and workflows."
|
|
||||||
# )
|
|
||||||
# },
|
|
||||||
|
|
||||||
"advanced": {
|
"advanced": {
|
||||||
"print_msg": "Advanced mode.",
|
"print_msg": "Advanced mode.",
|
||||||
"prompt_instruction": (
|
"prompt_instruction": (
|
||||||
@@ -420,19 +394,6 @@ def clean_text(text):
|
|||||||
# This is for the enrichment pipeline if it is enabled. It Uses the local LLM to extract key metadata from a chunk.
|
# This is for the enrichment pipeline if it is enabled. It Uses the local LLM to extract key metadata from a chunk.
|
||||||
# --------------------------------
|
# --------------------------------
|
||||||
|
|
||||||
# PROMPT_TEMPLATE = (
|
|
||||||
# "<|start_header_id|>user<|end_header_id|>\n"
|
|
||||||
# "Extract tags from the text below. Respond ONLY in this exact format: "
|
|
||||||
# "[Time: | Loc: | Entity: | Topic:]\n"
|
|
||||||
# # "[Time: | Loc: | Entity: | Theme:]\n"
|
|
||||||
# "ALL fields must be filled. If uncertain, make a best guess.\n"
|
|
||||||
# "Limit to 5 most important entities. No explanation.\n"
|
|
||||||
# "Text: {text}\n"
|
|
||||||
# "Tags: [<|eot_id|>\n"
|
|
||||||
# "<|start_header_id|>assistant<|end_header_id|>\n"
|
|
||||||
# )
|
|
||||||
|
|
||||||
|
|
||||||
PROMPT_TEMPLATE = (
|
PROMPT_TEMPLATE = (
|
||||||
"<|start_header_id|>user<|end_header_id|>\n"
|
"<|start_header_id|>user<|end_header_id|>\n"
|
||||||
"TAGGING OPERATION. NOT A CONVERSATION. NO EXPLANATIONS.\n"
|
"TAGGING OPERATION. NOT A CONVERSATION. NO EXPLANATIONS.\n"
|
||||||
@@ -585,30 +546,21 @@ def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
|
|||||||
# -------------------------
|
# -------------------------
|
||||||
# Check if cache is valid
|
# Check if cache is valid
|
||||||
# -------------------------
|
# -------------------------
|
||||||
# def cache_is_valid():
|
|
||||||
# print (f"Checking for existing enriched cache in {BOOK_DIR}...")
|
|
||||||
# if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
|
|
||||||
# return False
|
|
||||||
# with open(CACHE_META, "r") as f:
|
|
||||||
# meta = json.load(f)
|
|
||||||
# if meta.get("book_files") != book_files:
|
|
||||||
# return False
|
|
||||||
# for book_name in book_files:
|
|
||||||
# if not os.path.exists(book_name):
|
|
||||||
# continue
|
|
||||||
# stored_size = meta.get("file_sizes", {}).get(book_name)
|
|
||||||
# actual_size = os.path.getsize(book_name)
|
|
||||||
# if stored_size != actual_size:
|
|
||||||
# return False
|
|
||||||
# return True
|
|
||||||
|
|
||||||
def cache_is_valid():
|
def cache_is_valid():
|
||||||
print(f"\nChecking for existing enriched cache in {BOOK_DIR}...")
|
print(f"\nChecking for existing enriched cache in {BOOK_DIR}...")
|
||||||
|
|
||||||
# --- HARD FAIL: missing cache ---
|
status = {
|
||||||
|
"valid": True,
|
||||||
|
"added": [],
|
||||||
|
"modified": [],
|
||||||
|
"missing_embeddings": []
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- HARD FAIL: missing cache files ---
|
||||||
if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
|
if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
|
||||||
print("X Missing cache or metadata → rebuild required")
|
print("X Missing cache or metadata → rebuild required")
|
||||||
return False
|
status["valid"] = False
|
||||||
|
return status
|
||||||
|
|
||||||
with open(CACHE_META, "r") as f:
|
with open(CACHE_META, "r") as f:
|
||||||
meta = json.load(f)
|
meta = json.load(f)
|
||||||
@@ -616,69 +568,179 @@ def cache_is_valid():
|
|||||||
cached_files = set(meta.get("book_files", []))
|
cached_files = set(meta.get("book_files", []))
|
||||||
current_files = set(book_files)
|
current_files = set(book_files)
|
||||||
|
|
||||||
added = current_files - cached_files
|
# --- Detect NEW files ---
|
||||||
|
status["added"] = list(current_files - cached_files)
|
||||||
missing_embeddings = []
|
|
||||||
modified_files = []
|
|
||||||
|
|
||||||
|
# --- Check EXISTING files ---
|
||||||
for book_name in current_files:
|
for book_name in current_files:
|
||||||
if not os.path.exists(book_name):
|
if not os.path.exists(book_name):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# NEW FILE → ignore for now
|
# Skip new files (handled separately)
|
||||||
if book_name not in cached_files:
|
if book_name not in cached_files:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
stored_size = meta.get("file_sizes", {}).get(book_name)
|
stored_size = meta.get("file_sizes", {}).get(book_name)
|
||||||
actual_size = os.path.getsize(book_name)
|
actual_size = os.path.getsize(book_name)
|
||||||
|
|
||||||
# EXISTING FILE but missing metadata → BAD
|
# Missing metadata entry → bad
|
||||||
if stored_size is None:
|
if stored_size is None:
|
||||||
missing_embeddings.append(book_name)
|
status["missing_embeddings"].append(book_name)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# EXISTING FILE but changed → BAD
|
# File changed → needs reprocessing
|
||||||
if stored_size != actual_size:
|
if stored_size != actual_size:
|
||||||
modified_files.append(book_name)
|
status["modified"].append(book_name)
|
||||||
|
|
||||||
# --- HARD FAIL CONDITIONS ---
|
# --- HARD FAIL CONDITIONS ---
|
||||||
if missing_embeddings:
|
if status["missing_embeddings"]:
|
||||||
print(f"\nX Missing embeddings for {len(missing_embeddings)} file(s):")
|
print(f"\nX Missing embeddings for {len(status['missing_embeddings'])} file(s):")
|
||||||
for f in sorted(missing_embeddings):
|
for f in sorted(status["missing_embeddings"]):
|
||||||
print(f" * {f}")
|
print(f" * {f}")
|
||||||
print("→ Rebuild required")
|
status["valid"] = False
|
||||||
return False
|
|
||||||
|
|
||||||
if modified_files:
|
if status["modified"]:
|
||||||
print(f"\nX {len(modified_files)} file(s) changed:")
|
print(f"\nX {len(status['modified'])} file(s) changed:")
|
||||||
for f in sorted(modified_files):
|
for f in sorted(status["modified"]):
|
||||||
print(f" * {f}")
|
print(f" * {f}")
|
||||||
print("→ Rebuild required")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# --- SOFT WARNING ---
|
# --- SOFT WARNING ---
|
||||||
if added:
|
if status["added"]:
|
||||||
print(f"\n+ {len(added)} new file(s) detected (not yet embedded):")
|
print(f"\n+ {len(status['added'])} new file(s) detected:")
|
||||||
for f in sorted(added):
|
for f in sorted(status["added"]):
|
||||||
print(f" + {f}")
|
print(f" + {f}")
|
||||||
print("→ Continuing with existing cache (new files will be ignored)")
|
|
||||||
|
|
||||||
print("\n✓ Cache usable")
|
if status["valid"]:
|
||||||
return True
|
print("\n✓ Cache usable (incremental updates possible)")
|
||||||
|
else:
|
||||||
|
print("\n→ Full rebuild required")
|
||||||
|
|
||||||
|
return status
|
||||||
|
|
||||||
|
# --------------------------------------------------
|
||||||
|
# Save updated cache file
|
||||||
|
# --------------------------------------------------
|
||||||
|
def save_updated_cache():
    """Write the in-memory embedding index and its metadata file to disk.

    Reads the module-level ``chunk_embeddings``, ``all_chunks``,
    ``all_sources`` and ``book_files``. The arrays go to ``CACHE_FILE`` via
    ``np.savez``; the list of book files and their current on-disk sizes go
    to ``CACHE_META`` as JSON.
    """
    # Chunks and sources are stored as object-dtype arrays.
    chunk_array = np.array(all_chunks, dtype=object)
    source_array = np.array(all_sources, dtype=object)
    np.savez(
        CACHE_FILE,
        embeddings=chunk_embeddings,
        chunks=chunk_array,
        sources=source_array,
    )

    # Record a size only for files that currently exist on disk.
    file_sizes = {}
    for path in book_files:
        if os.path.exists(path):
            file_sizes[path] = os.path.getsize(path)

    metadata = {"book_files": book_files, "file_sizes": file_sizes}
    with open(CACHE_META, "w") as meta_handle:
        json.dump(metadata, meta_handle)

    print("Cache updated.")
|
||||||
|
|
||||||
|
# --------------------------------------------------
|
||||||
|
# Remove chunks from embeddings for specified files
|
||||||
|
# --------------------------------------------------
|
||||||
|
def remove_chunks_for_files(files_to_remove):
    """Drop every cached chunk whose source is one of ``files_to_remove``.

    Rebinds the module-level ``all_chunks``, ``all_sources`` and
    ``chunk_embeddings`` so that the three stay index-aligned after the
    removal. Does nothing when ``files_to_remove`` is empty.

    Args:
        files_to_remove: Collection of source file names whose chunks
            should be discarded.
    """
    global all_chunks, all_sources, chunk_embeddings

    if not files_to_remove:
        return

    # Build the lookup once: membership is tested for every cached chunk,
    # so a list here would make the filter O(chunks * files).
    removal_set = set(files_to_remove)

    keep_indices = [
        index for index, src in enumerate(all_sources)
        if src not in removal_set
    ]

    all_chunks = [all_chunks[index] for index in keep_indices]
    all_sources = [all_sources[index] for index in keep_indices]
    # Index the embedding array with the same positions to keep rows aligned
    # with the chunk and source lists.
    chunk_embeddings = chunk_embeddings[keep_indices]

    print(f"Removed old chunks for {len(files_to_remove)} modified file(s)")
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Process new and modified files
|
||||||
|
# -------------------------
|
||||||
|
def process_incremental_updates(status):
    """Bring the loaded cache up to date for added and modified book files.

    Chunks belonging to modified files are removed first. Every added or
    modified file is then read, cleaned, chunked and embedded, and the
    results are appended to the module-level ``all_chunks``, ``all_sources``
    and ``chunk_embeddings``. The cache is written back with
    ``save_updated_cache()`` whenever at least one file was processed, even
    if that produced no new chunks, so that removals and refreshed file
    sizes reach the on-disk cache.

    Args:
        status: Mapping with ``"added"`` and ``"modified"`` lists of file
            names, in the shape returned by ``cache_is_valid()``.
    """
    global all_chunks, all_sources, chunk_embeddings

    files_to_process = status["added"] + status["modified"]

    # Step 1 - remove outdated chunks (ONLY modified files; added files have
    # nothing in the cache yet).
    remove_chunks_for_files(status["modified"])

    new_chunks = []
    new_sources = []

    # Step 2 - process new + modified files
    for book_name in files_to_process:
        print(f"[Updating] {book_name}")

        with open(book_name, "rb") as f:
            raw = f.read()

        # Prefer UTF-8; fall back to cp1252 only for a decoding failure so
        # unrelated errors (and Ctrl-C) are not swallowed.
        try:
            text = raw.decode("utf-8")
        except UnicodeDecodeError:
            text = raw.decode("cp1252")

        # Skip files whose first non-blank line is an IGNORE marker.
        stripped = text.lstrip()
        first_line = stripped.splitlines()[0] if stripped else ""
        if first_line.strip().upper().startswith("# IGNORE"):
            print(f"Skipping {book_name} (marked IGNORE)")
            continue

        book_text = clean_text(text)
        chunks = chunk_text(book_text)

        new_chunks.extend(chunks)
        new_sources.extend([book_name] * len(chunks))

    # Step 3 - nothing to add
    if not new_chunks:
        print("No new chunks to add.")
        # Still persist: chunks of modified files were already dropped in
        # memory, and without a save the stale cache metadata would flag the
        # same files again on every run.
        if files_to_process:
            save_updated_cache()
        return

    # Step 4 - embed
    print(f"Embedding {len(new_chunks)} new chunks...")
    new_embeddings = embed_model.encode(new_chunks, convert_to_tensor=False)

    # Step 5 - append
    if len(all_chunks) == 0:
        # Edge case: empty cache, so the new data becomes the whole index.
        all_chunks = new_chunks
        all_sources = new_sources
        chunk_embeddings = np.array(new_embeddings)
    else:
        all_chunks.extend(new_chunks)
        all_sources.extend(new_sources)
        chunk_embeddings = np.vstack([chunk_embeddings, new_embeddings])

    # Step 6 - save
    save_updated_cache()
|
||||||
|
|
||||||
# -------------------------
|
# -------------------------
|
||||||
# Load or build embeddings
|
# Load or build embeddings
|
||||||
# -------------------------
|
# -------------------------
|
||||||
all_chunks = []
|
all_chunks = []
|
||||||
all_sources = []
|
all_sources = []
|
||||||
|
|
||||||
if cache_is_valid():
|
status = cache_is_valid()
|
||||||
|
|
||||||
|
# if cache_is_valid():
|
||||||
|
if status["valid"]:
|
||||||
print("Loading embeddings from cache...")
|
print("Loading embeddings from cache...")
|
||||||
data = np.load(CACHE_FILE, allow_pickle=True)
|
data = np.load(CACHE_FILE, allow_pickle=True)
|
||||||
chunk_embeddings = data["embeddings"]
|
chunk_embeddings = data["embeddings"]
|
||||||
all_chunks = list(data["chunks"])
|
all_chunks = list(data["chunks"])
|
||||||
all_sources = list(data["sources"])
|
all_sources = list(data["sources"])
|
||||||
print(f"Total chunks loaded from cache: {len(all_chunks)}")
|
print(f"Total chunks loaded from cache: {len(all_chunks)}")
|
||||||
|
|
||||||
|
# check if we have additions or modifications
|
||||||
|
if status["added"] or status["modified"]:
|
||||||
|
print("\n[Incremental update triggered]")
|
||||||
|
process_incremental_updates(status)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print("Building embeddings from scratch...")
|
print("Building embeddings from scratch...")
|
||||||
for book_name in book_files:
|
for book_name in book_files:
|
||||||
|
|||||||
Reference in New Issue
Block a user