Incremental updates

This commit is contained in:
2026-04-25 08:57:38 -04:00
parent 72cfbe841f
commit f7d0978148
5 changed files with 149 additions and 87 deletions

View File

@@ -1 +1 @@
{"book_files": ["Books\\Accounting\\Accounting\\Account.txt", "Books\\Accounting\\Accounting\\BankFeed.txt", "Books\\Accounting\\Accounting\\BillingChange.txt", "Books\\Accounting\\Accounting\\CashReceipts.txt", "Books\\Accounting\\Accounting\\Common.txt", "Books\\Accounting\\Accounting\\Endorsement.txt", "Books\\Accounting\\Accounting\\Fifo.txt", "Books\\Accounting\\Accounting\\Integrations.txt", "Books\\Accounting\\Accounting\\PayoffBalance.txt", "Books\\Accounting\\Accounting\\Policy.txt", "Books\\Accounting\\Accounting\\Quote.txt", "Books\\Accounting\\Accounting\\Repository.txt", "Books\\Accounting\\Accounting\\Services.txt", "Books\\Accounting\\Accounting\\Starter.txt", "Books\\Accounting\\Accounting\\StructuredTags.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt", "Books\\Accounting\\Jira\\IA-2691.txt", "Books\\Accounting\\Jira\\IA-2698.txt", "Books\\Accounting\\Jira\\IA-2827.txt", "Books\\Accounting\\Jira\\IA-2852.txt", "Books\\Accounting\\Jira\\IA-2854.txt", "Books\\Accounting\\Jira\\IA-2855.txt", "Books\\Accounting\\Jira\\IA-2858.txt", "Books\\Accounting\\Jira\\IA-2866.txt", "Books\\Accounting\\Oracle\\Oracle.txt"], "file_sizes": {"Books\\Accounting\\Accounting\\Account.txt": 7587, "Books\\Accounting\\Accounting\\BankFeed.txt": 14170, "Books\\Accounting\\Accounting\\BillingChange.txt": 9648, 
"Books\\Accounting\\Accounting\\CashReceipts.txt": 9208, "Books\\Accounting\\Accounting\\Common.txt": 7444, "Books\\Accounting\\Accounting\\Endorsement.txt": 10019, "Books\\Accounting\\Accounting\\Fifo.txt": 9201, "Books\\Accounting\\Accounting\\Integrations.txt": 8451, "Books\\Accounting\\Accounting\\PayoffBalance.txt": 8413, "Books\\Accounting\\Accounting\\Policy.txt": 11662, "Books\\Accounting\\Accounting\\Quote.txt": 6815, "Books\\Accounting\\Accounting\\Repository.txt": 7395, "Books\\Accounting\\Accounting\\Services.txt": 6633, "Books\\Accounting\\Accounting\\Starter.txt": 5075, "Books\\Accounting\\Accounting\\StructuredTags.txt": 5606, "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt": 24315, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt": 22960, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt": 11447, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt": 18039, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt": 21206, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt": 12886, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt": 0, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt": 7669, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt": 8241, "Books\\Accounting\\Jira\\IA-2691.txt": 8131, "Books\\Accounting\\Jira\\IA-2698.txt": 6072, "Books\\Accounting\\Jira\\IA-2827.txt": 7218, "Books\\Accounting\\Jira\\IA-2852.txt": 5870, "Books\\Accounting\\Jira\\IA-2854.txt": 6748, "Books\\Accounting\\Jira\\IA-2855.txt": 6776, "Books\\Accounting\\Jira\\IA-2858.txt": 5925, "Books\\Accounting\\Jira\\IA-2866.txt": 4241, "Books\\Accounting\\Oracle\\Oracle.txt": 262504}}
{"book_files": ["Books\\Accounting\\Accounting\\Account.txt", "Books\\Accounting\\Accounting\\BankFeed.txt", "Books\\Accounting\\Accounting\\BillingChange.txt", "Books\\Accounting\\Accounting\\CashReceipts.txt", "Books\\Accounting\\Accounting\\Common.txt", "Books\\Accounting\\Accounting\\Endorsement.txt", "Books\\Accounting\\Accounting\\Fifo.txt", "Books\\Accounting\\Accounting\\Integrations.txt", "Books\\Accounting\\Accounting\\PayoffBalance.txt", "Books\\Accounting\\Accounting\\Policy.txt", "Books\\Accounting\\Accounting\\Quote.txt", "Books\\Accounting\\Accounting\\Repository.txt", "Books\\Accounting\\Accounting\\Services.txt", "Books\\Accounting\\Accounting\\Starter.txt", "Books\\Accounting\\Accounting\\StructuredTags.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt", "Books\\Accounting\\Jira\\IA-2691.txt", "Books\\Accounting\\Jira\\IA-2698.txt", "Books\\Accounting\\Jira\\IA-2827.txt", "Books\\Accounting\\Jira\\IA-2852.txt", "Books\\Accounting\\Jira\\IA-2854.txt", "Books\\Accounting\\Jira\\IA-2855.txt", "Books\\Accounting\\Jira\\IA-2858.txt", "Books\\Accounting\\Jira\\IA-2866.txt"], "file_sizes": {"Books\\Accounting\\Accounting\\Account.txt": 7587, "Books\\Accounting\\Accounting\\BankFeed.txt": 14170, "Books\\Accounting\\Accounting\\BillingChange.txt": 9648, "Books\\Accounting\\Accounting\\CashReceipts.txt": 9208, 
"Books\\Accounting\\Accounting\\Common.txt": 7444, "Books\\Accounting\\Accounting\\Endorsement.txt": 10019, "Books\\Accounting\\Accounting\\Fifo.txt": 9201, "Books\\Accounting\\Accounting\\Integrations.txt": 8451, "Books\\Accounting\\Accounting\\PayoffBalance.txt": 8413, "Books\\Accounting\\Accounting\\Policy.txt": 11662, "Books\\Accounting\\Accounting\\Quote.txt": 6815, "Books\\Accounting\\Accounting\\Repository.txt": 7395, "Books\\Accounting\\Accounting\\Services.txt": 6633, "Books\\Accounting\\Accounting\\Starter.txt": 5075, "Books\\Accounting\\Accounting\\StructuredTags.txt": 5606, "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt": 24315, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt": 22960, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt": 11447, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt": 18039, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt": 21206, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt": 12886, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt": 0, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt": 7669, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt": 8241, "Books\\Accounting\\Jira\\IA-2691.txt": 8131, "Books\\Accounting\\Jira\\IA-2698.txt": 6072, "Books\\Accounting\\Jira\\IA-2827.txt": 7218, "Books\\Accounting\\Jira\\IA-2852.txt": 5870, "Books\\Accounting\\Jira\\IA-2854.txt": 6748, "Books\\Accounting\\Jira\\IA-2855.txt": 6776, "Books\\Accounting\\Jira\\IA-2858.txt": 5925, "Books\\Accounting\\Jira\\IA-2866.txt": 4241}}

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -141,20 +141,6 @@ MODES = {
"End your response with a single period."
)
},
# "research": {
# "print_msg": "Research mode.",
# "prompt_instruction": (
# "You are a helpful research assistant. "
# "Restrict your response strictly to the provided context. "
# "If the source material is exhausted, stop writing. "
# "If a relationship or entity is not explicitly documented in the context, do not include it. "
# # "Do not infer, supplement, or use external training knowledge. "
# "Be direct and concise. "
# "Never repeat the context or instructions. "
# "Never echo the question. "
# "End your answer with a single period. "
# )
# },
"research": {
"print_msg": "Research mode.",
@@ -162,31 +148,19 @@ MODES = {
"You are a helpful research assistant. "
"Restrict your response strictly to the provided context. "
"If the source material is exhausted, stop writing. "
# "If the context does not contain enough information to fully answer, provide only what is explicitly supported or say you don't know. "
"If a relationship or entity is not explicitly documented in the context, do not include it. "
"Do not repeat the same information in different wording. "
"If multiple context passages express the same idea, summarize it once. "
"If the context contains repetitive legal or procedural text, merge it into a single concise statement. "
"Do not list multiple similar verses. "
"Prefer one coherent explanation over multiple extracted quotations. "
# "Do not infer, supplement, or use external training knowledge. "
"Do not infer, guess, or use external knowledge under any circumstances. "
"If the answer is not explicitly stated in the context, respond with: \"I don't know based on the provided context.\""
# "Be direct and concise. "
"Never repeat the context or instructions. "
"Never echo the question. "
"End your answer with a single period. "
)
},
# "advanced": {
# "print_msg": "Advanced mode.",
# "prompt_instruction": (
# "You are adept at mathematics and computer programming. "
# "You are a linguist able to put together complex ideas and work with formulations and workflows."
# )
# },
"advanced": {
"print_msg": "Advanced mode.",
"prompt_instruction": (
@@ -420,19 +394,6 @@ def clean_text(text):
# This is for the enrichment pipeline if it is enabled. It Uses the local LLM to extract key metadata from a chunk.
# --------------------------------
# PROMPT_TEMPLATE = (
# "<|start_header_id|>user<|end_header_id|>\n"
# "Extract tags from the text below. Respond ONLY in this exact format: "
# "[Time: | Loc: | Entity: | Topic:]\n"
# # "[Time: | Loc: | Entity: | Theme:]\n"
# "ALL fields must be filled. If uncertain, make a best guess.\n"
# "Limit to 5 most important entities. No explanation.\n"
# "Text: {text}\n"
# "Tags: [<|eot_id|>\n"
# "<|start_header_id|>assistant<|end_header_id|>\n"
# )
PROMPT_TEMPLATE = (
"<|start_header_id|>user<|end_header_id|>\n"
"TAGGING OPERATION. NOT A CONVERSATION. NO EXPLANATIONS.\n"
@@ -585,30 +546,21 @@ def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
# -------------------------
# Check if cache is valid
# -------------------------
# def cache_is_valid():
# print (f"Checking for existing enriched cache in {BOOK_DIR}...")
# if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
# return False
# with open(CACHE_META, "r") as f:
# meta = json.load(f)
# if meta.get("book_files") != book_files:
# return False
# for book_name in book_files:
# if not os.path.exists(book_name):
# continue
# stored_size = meta.get("file_sizes", {}).get(book_name)
# actual_size = os.path.getsize(book_name)
# if stored_size != actual_size:
# return False
# return True
def cache_is_valid():
print(f"\nChecking for existing enriched cache in {BOOK_DIR}...")
# --- HARD FAIL: missing cache ---
status = {
"valid": True,
"added": [],
"modified": [],
"missing_embeddings": []
}
# --- HARD FAIL: missing cache files ---
if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
print("X Missing cache or metadata → rebuild required")
return False
status["valid"] = False
return status
with open(CACHE_META, "r") as f:
meta = json.load(f)
@@ -616,69 +568,179 @@ def cache_is_valid():
cached_files = set(meta.get("book_files", []))
current_files = set(book_files)
added = current_files - cached_files
missing_embeddings = []
modified_files = []
# --- Detect NEW files ---
status["added"] = list(current_files - cached_files)
# --- Check EXISTING files ---
for book_name in current_files:
if not os.path.exists(book_name):
continue
# NEW FILE → ignore for now
# Skip new files (handled separately)
if book_name not in cached_files:
continue
stored_size = meta.get("file_sizes", {}).get(book_name)
actual_size = os.path.getsize(book_name)
# EXISTING FILE but missing metadata → BAD
# Missing metadata entry → bad
if stored_size is None:
missing_embeddings.append(book_name)
status["missing_embeddings"].append(book_name)
continue
# EXISTING FILE but changed → BAD
# File changed → needs reprocessing
if stored_size != actual_size:
modified_files.append(book_name)
status["modified"].append(book_name)
# --- HARD FAIL CONDITIONS ---
if missing_embeddings:
print(f"\nX Missing embeddings for {len(missing_embeddings)} file(s):")
for f in sorted(missing_embeddings):
if status["missing_embeddings"]:
print(f"\nX Missing embeddings for {len(status['missing_embeddings'])} file(s):")
for f in sorted(status["missing_embeddings"]):
print(f" * {f}")
print("→ Rebuild required")
return False
status["valid"] = False
if modified_files:
print(f"\nX {len(modified_files)} file(s) changed:")
for f in sorted(modified_files):
if status["modified"]:
print(f"\nX {len(status['modified'])} file(s) changed:")
for f in sorted(status["modified"]):
print(f" * {f}")
print("→ Rebuild required")
return False
# --- SOFT WARNING ---
if added:
print(f"\n+ {len(added)} new file(s) detected (not yet embedded):")
for f in sorted(added):
if status["added"]:
print(f"\n+ {len(status['added'])} new file(s) detected:")
for f in sorted(status["added"]):
print(f" + {f}")
print("→ Continuing with existing cache (new files will be ignored)")
print("\n✓ Cache usable")
return True
if status["valid"]:
print("\n✓ Cache usable (incremental updates possible)")
else:
print("\n→ Full rebuild required")
return status
# --------------------------------------------------
# Save updated cache file
# --------------------------------------------------
def save_updated_cache():
    """Persist the in-memory corpus and its metadata to disk.

    Writes the embeddings/chunks/sources triple to CACHE_FILE (NumPy .npz)
    and a JSON sidecar (CACHE_META) recording which book files were indexed
    and their byte sizes, which cache_is_valid() later compares against.
    """
    np.savez(
        CACHE_FILE,
        embeddings=chunk_embeddings,
        chunks=np.array(all_chunks, dtype=object),
        sources=np.array(all_sources, dtype=object)
    )
    # Record current on-disk sizes so future runs can detect modifications.
    sizes = {}
    for path in book_files:
        if os.path.exists(path):
            sizes[path] = os.path.getsize(path)
    meta = {"book_files": book_files, "file_sizes": sizes}
    with open(CACHE_META, "w") as meta_fh:
        json.dump(meta, meta_fh)
    print("Cache updated.")
# --------------------------------------------------
# Remove chunks from embeddings for specified files
# --------------------------------------------------
def remove_chunks_for_files(files_to_remove):
    """Drop cached chunks/embeddings originating from the given files.

    Rebinds the module-level ``all_chunks``, ``all_sources`` and
    ``chunk_embeddings`` so stale data for modified files is removed before
    re-embedding them.

    Args:
        files_to_remove: iterable of source file paths whose chunks are
            outdated. Empty/None is a no-op.
    """
    global all_chunks, all_sources, chunk_embeddings
    if not files_to_remove:
        return
    # Membership is tested once per chunk; a set makes that O(1) instead of
    # scanning the list for every chunk.
    remove_set = set(files_to_remove)
    keep_indices = [
        i for i, src in enumerate(all_sources)
        if src not in remove_set
    ]
    all_chunks = [all_chunks[i] for i in keep_indices]
    all_sources = [all_sources[i] for i in keep_indices]
    # NumPy fancy indexing selects the kept rows in their original order.
    chunk_embeddings = chunk_embeddings[keep_indices]
    print(f"Removed old chunks for {len(files_to_remove)} modified file(s)")
# -------------------------
# Process new and modified files
# -------------------------
def process_incremental_updates(status):
    """Embed new/modified book files and merge them into the cached corpus.

    Args:
        status: dict produced by cache_is_valid() with at least the keys
            "added" (new files not yet embedded) and "modified" (files whose
            size changed since the cache was written).

    Side effects: rebinds the module-level ``all_chunks``, ``all_sources``
    and ``chunk_embeddings``, and rewrites the on-disk cache via
    save_updated_cache().
    """
    global all_chunks, all_sources, chunk_embeddings
    files_to_process = status["added"] + status["modified"]
    # Step 1 — drop stale chunks for modified files only (added files have
    # no cached chunks to remove).
    remove_chunks_for_files(status["modified"])
    new_chunks = []
    new_sources = []
    # Step 2 — read, clean and chunk each new or modified file.
    for book_name in files_to_process:
        print(f"[Updating] {book_name}")
        with open(book_name, "rb") as f:
            raw = f.read()
        try:
            text = raw.decode("utf-8")
        except UnicodeDecodeError:
            # Narrowed from a bare except: only a decode failure should
            # trigger the Windows-1252 fallback for legacy-encoded files.
            text = raw.decode("cp1252")
        # Honor the "# IGNORE" sentinel on the first non-blank line.
        first_line = text.lstrip().splitlines()[0] if text.strip() else ""
        if first_line.strip().upper().startswith("# IGNORE"):
            print(f"Skipping {book_name} (marked IGNORE)")
            continue
        book_text = clean_text(text)
        chunks = chunk_text(book_text)
        new_chunks.extend(chunks)
        new_sources.extend([book_name] * len(chunks))
    # Step 3 — every candidate file was skipped or empty.
    if not new_chunks:
        print("No new chunks to add.")
        return
    # Step 4 — embed only the new chunks (the incremental win vs. a rebuild).
    print(f"Embedding {len(new_chunks)} new chunks...")
    new_embeddings = embed_model.encode(new_chunks, convert_to_tensor=False)
    # Step 5 — append to the in-memory corpus, or initialize it when the
    # cache was empty.
    if len(all_chunks) == 0:
        all_chunks = new_chunks
        all_sources = new_sources
        chunk_embeddings = np.array(new_embeddings)
    else:
        all_chunks.extend(new_chunks)
        all_sources.extend(new_sources)
        chunk_embeddings = np.vstack([chunk_embeddings, new_embeddings])
    # Step 6 — persist the merged corpus and refreshed metadata.
    save_updated_cache()
# -------------------------
# Load or build embeddings
# -------------------------
all_chunks = []
all_sources = []
if cache_is_valid():
status = cache_is_valid()
# if cache_is_valid():
if status["valid"]:
print("Loading embeddings from cache...")
data = np.load(CACHE_FILE, allow_pickle=True)
chunk_embeddings = data["embeddings"]
all_chunks = list(data["chunks"])
all_sources = list(data["sources"])
print(f"Total chunks loaded from cache: {len(all_chunks)}")
# check if we have additions or modifications
if status["added"] or status["modified"]:
print("\n[Incremental update triggered]")
process_incremental_updates(status)
else:
print("Building embeddings from scratch...")
for book_name in book_files: