Incremental updates

This commit is contained in:
2026-04-25 08:57:38 -04:00
parent 72cfbe841f
commit f7d0978148
5 changed files with 149 additions and 87 deletions

View File

@@ -1 +1 @@
{"book_files": ["Books\\Accounting\\Accounting\\Account.txt", "Books\\Accounting\\Accounting\\BankFeed.txt", "Books\\Accounting\\Accounting\\BillingChange.txt", "Books\\Accounting\\Accounting\\CashReceipts.txt", "Books\\Accounting\\Accounting\\Common.txt", "Books\\Accounting\\Accounting\\Endorsement.txt", "Books\\Accounting\\Accounting\\Fifo.txt", "Books\\Accounting\\Accounting\\Integrations.txt", "Books\\Accounting\\Accounting\\PayoffBalance.txt", "Books\\Accounting\\Accounting\\Policy.txt", "Books\\Accounting\\Accounting\\Quote.txt", "Books\\Accounting\\Accounting\\Repository.txt", "Books\\Accounting\\Accounting\\Services.txt", "Books\\Accounting\\Accounting\\Starter.txt", "Books\\Accounting\\Accounting\\StructuredTags.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt", "Books\\Accounting\\Jira\\IA-2691.txt", "Books\\Accounting\\Jira\\IA-2698.txt", "Books\\Accounting\\Jira\\IA-2827.txt", "Books\\Accounting\\Jira\\IA-2852.txt", "Books\\Accounting\\Jira\\IA-2854.txt", "Books\\Accounting\\Jira\\IA-2855.txt", "Books\\Accounting\\Jira\\IA-2858.txt", "Books\\Accounting\\Jira\\IA-2866.txt", "Books\\Accounting\\Oracle\\Oracle.txt"], "file_sizes": {"Books\\Accounting\\Accounting\\Account.txt": 7587, "Books\\Accounting\\Accounting\\BankFeed.txt": 14170, "Books\\Accounting\\Accounting\\BillingChange.txt": 9648, 
"Books\\Accounting\\Accounting\\CashReceipts.txt": 9208, "Books\\Accounting\\Accounting\\Common.txt": 7444, "Books\\Accounting\\Accounting\\Endorsement.txt": 10019, "Books\\Accounting\\Accounting\\Fifo.txt": 9201, "Books\\Accounting\\Accounting\\Integrations.txt": 8451, "Books\\Accounting\\Accounting\\PayoffBalance.txt": 8413, "Books\\Accounting\\Accounting\\Policy.txt": 11662, "Books\\Accounting\\Accounting\\Quote.txt": 6815, "Books\\Accounting\\Accounting\\Repository.txt": 7395, "Books\\Accounting\\Accounting\\Services.txt": 6633, "Books\\Accounting\\Accounting\\Starter.txt": 5075, "Books\\Accounting\\Accounting\\StructuredTags.txt": 5606, "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt": 24315, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt": 22960, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt": 11447, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt": 18039, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt": 21206, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt": 12886, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt": 0, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt": 7669, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt": 8241, "Books\\Accounting\\Jira\\IA-2691.txt": 8131, "Books\\Accounting\\Jira\\IA-2698.txt": 6072, "Books\\Accounting\\Jira\\IA-2827.txt": 7218, "Books\\Accounting\\Jira\\IA-2852.txt": 5870, "Books\\Accounting\\Jira\\IA-2854.txt": 6748, "Books\\Accounting\\Jira\\IA-2855.txt": 6776, "Books\\Accounting\\Jira\\IA-2858.txt": 5925, "Books\\Accounting\\Jira\\IA-2866.txt": 4241, "Books\\Accounting\\Oracle\\Oracle.txt": 262504}}
{"book_files": ["Books\\Accounting\\Accounting\\Account.txt", "Books\\Accounting\\Accounting\\BankFeed.txt", "Books\\Accounting\\Accounting\\BillingChange.txt", "Books\\Accounting\\Accounting\\CashReceipts.txt", "Books\\Accounting\\Accounting\\Common.txt", "Books\\Accounting\\Accounting\\Endorsement.txt", "Books\\Accounting\\Accounting\\Fifo.txt", "Books\\Accounting\\Accounting\\Integrations.txt", "Books\\Accounting\\Accounting\\PayoffBalance.txt", "Books\\Accounting\\Accounting\\Policy.txt", "Books\\Accounting\\Accounting\\Quote.txt", "Books\\Accounting\\Accounting\\Repository.txt", "Books\\Accounting\\Accounting\\Services.txt", "Books\\Accounting\\Accounting\\Starter.txt", "Books\\Accounting\\Accounting\\StructuredTags.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt", "Books\\Accounting\\Jira\\IA-2691.txt", "Books\\Accounting\\Jira\\IA-2698.txt", "Books\\Accounting\\Jira\\IA-2827.txt", "Books\\Accounting\\Jira\\IA-2852.txt", "Books\\Accounting\\Jira\\IA-2854.txt", "Books\\Accounting\\Jira\\IA-2855.txt", "Books\\Accounting\\Jira\\IA-2858.txt", "Books\\Accounting\\Jira\\IA-2866.txt"], "file_sizes": {"Books\\Accounting\\Accounting\\Account.txt": 7587, "Books\\Accounting\\Accounting\\BankFeed.txt": 14170, "Books\\Accounting\\Accounting\\BillingChange.txt": 9648, "Books\\Accounting\\Accounting\\CashReceipts.txt": 9208, 
"Books\\Accounting\\Accounting\\Common.txt": 7444, "Books\\Accounting\\Accounting\\Endorsement.txt": 10019, "Books\\Accounting\\Accounting\\Fifo.txt": 9201, "Books\\Accounting\\Accounting\\Integrations.txt": 8451, "Books\\Accounting\\Accounting\\PayoffBalance.txt": 8413, "Books\\Accounting\\Accounting\\Policy.txt": 11662, "Books\\Accounting\\Accounting\\Quote.txt": 6815, "Books\\Accounting\\Accounting\\Repository.txt": 7395, "Books\\Accounting\\Accounting\\Services.txt": 6633, "Books\\Accounting\\Accounting\\Starter.txt": 5075, "Books\\Accounting\\Accounting\\StructuredTags.txt": 5606, "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt": 24315, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt": 22960, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt": 11447, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt": 18039, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt": 21206, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt": 12886, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt": 0, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt": 7669, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt": 8241, "Books\\Accounting\\Jira\\IA-2691.txt": 8131, "Books\\Accounting\\Jira\\IA-2698.txt": 6072, "Books\\Accounting\\Jira\\IA-2827.txt": 7218, "Books\\Accounting\\Jira\\IA-2852.txt": 5870, "Books\\Accounting\\Jira\\IA-2854.txt": 6748, "Books\\Accounting\\Jira\\IA-2855.txt": 6776, "Books\\Accounting\\Jira\\IA-2858.txt": 5925, "Books\\Accounting\\Jira\\IA-2866.txt": 4241}}

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -141,20 +141,6 @@ MODES = {
"End your response with a single period."
)
},
# "research": {
# "print_msg": "Research mode.",
# "prompt_instruction": (
# "You are a helpful research assistant. "
# "Restrict your response strictly to the provided context. "
# "If the source material is exhausted, stop writing. "
# "If a relationship or entity is not explicitly documented in the context, do not include it. "
# # "Do not infer, supplement, or use external training knowledge. "
# "Be direct and concise. "
# "Never repeat the context or instructions. "
# "Never echo the question. "
# "End your answer with a single period. "
# )
# },
"research": {
"print_msg": "Research mode.",
@@ -162,31 +148,19 @@ MODES = {
"You are a helpful research assistant. "
"Restrict your response strictly to the provided context. "
"If the source material is exhausted, stop writing. "
# "If the context does not contain enough information to fully answer, provide only what is explicitly supported or say you don't know. "
"If a relationship or entity is not explicitly documented in the context, do not include it. "
"Do not repeat the same information in different wording. "
"If multiple context passages express the same idea, summarize it once. "
"If the context contains repetitive legal or procedural text, merge it into a single concise statement. "
"Do not list multiple similar verses. "
"Prefer one coherent explanation over multiple extracted quotations. "
# "Do not infer, supplement, or use external training knowledge. "
"Do not infer, guess, or use external knowledge under any circumstances. "
"If the answer is not explicitly stated in the context, respond with: \"I don't know based on the provided context.\""
# "Be direct and concise. "
"Never repeat the context or instructions. "
"Never echo the question. "
"End your answer with a single period. "
)
},
# "advanced": {
# "print_msg": "Advanced mode.",
# "prompt_instruction": (
# "You are adept at mathematics and computer programming. "
# "You are a linguist able to put together complex ideas and work with formulations and workflows."
# )
# },
"advanced": {
"print_msg": "Advanced mode.",
"prompt_instruction": (
@@ -420,19 +394,6 @@ def clean_text(text):
# This is for the enrichment pipeline if it is enabled. It Uses the local LLM to extract key metadata from a chunk.
# --------------------------------
# PROMPT_TEMPLATE = (
# "<|start_header_id|>user<|end_header_id|>\n"
# "Extract tags from the text below. Respond ONLY in this exact format: "
# "[Time: | Loc: | Entity: | Topic:]\n"
# # "[Time: | Loc: | Entity: | Theme:]\n"
# "ALL fields must be filled. If uncertain, make a best guess.\n"
# "Limit to 5 most important entities. No explanation.\n"
# "Text: {text}\n"
# "Tags: [<|eot_id|>\n"
# "<|start_header_id|>assistant<|end_header_id|>\n"
# )
PROMPT_TEMPLATE = (
"<|start_header_id|>user<|end_header_id|>\n"
"TAGGING OPERATION. NOT A CONVERSATION. NO EXPLANATIONS.\n"
@@ -585,30 +546,21 @@ def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
# -------------------------
# Check if cache is valid
# -------------------------
# def cache_is_valid():
# print (f"Checking for existing enriched cache in {BOOK_DIR}...")
# if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
# return False
# with open(CACHE_META, "r") as f:
# meta = json.load(f)
# if meta.get("book_files") != book_files:
# return False
# for book_name in book_files:
# if not os.path.exists(book_name):
# continue
# stored_size = meta.get("file_sizes", {}).get(book_name)
# actual_size = os.path.getsize(book_name)
# if stored_size != actual_size:
# return False
# return True
def cache_is_valid():
print(f"\nChecking for existing enriched cache in {BOOK_DIR}...")
# --- HARD FAIL: missing cache ---
status = {
"valid": True,
"added": [],
"modified": [],
"missing_embeddings": []
}
# --- HARD FAIL: missing cache files ---
if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
print("X Missing cache or metadata → rebuild required")
return False
status["valid"] = False
return status
with open(CACHE_META, "r") as f:
meta = json.load(f)
@@ -616,69 +568,179 @@ def cache_is_valid():
cached_files = set(meta.get("book_files", []))
current_files = set(book_files)
added = current_files - cached_files
missing_embeddings = []
modified_files = []
# --- Detect NEW files ---
status["added"] = list(current_files - cached_files)
# --- Check EXISTING files ---
for book_name in current_files:
if not os.path.exists(book_name):
continue
# NEW FILE → ignore for now
# Skip new files (handled separately)
if book_name not in cached_files:
continue
stored_size = meta.get("file_sizes", {}).get(book_name)
actual_size = os.path.getsize(book_name)
# EXISTING FILE but missing metadata → BAD
# Missing metadata entry → bad
if stored_size is None:
missing_embeddings.append(book_name)
status["missing_embeddings"].append(book_name)
continue
# EXISTING FILE but changed → BAD
# File changed → needs reprocessing
if stored_size != actual_size:
modified_files.append(book_name)
status["modified"].append(book_name)
# --- HARD FAIL CONDITIONS ---
if missing_embeddings:
print(f"\nX Missing embeddings for {len(missing_embeddings)} file(s):")
for f in sorted(missing_embeddings):
if status["missing_embeddings"]:
print(f"\nX Missing embeddings for {len(status['missing_embeddings'])} file(s):")
for f in sorted(status["missing_embeddings"]):
print(f" * {f}")
print("→ Rebuild required")
return False
status["valid"] = False
if modified_files:
print(f"\nX {len(modified_files)} file(s) changed:")
for f in sorted(modified_files):
if status["modified"]:
print(f"\nX {len(status['modified'])} file(s) changed:")
for f in sorted(status["modified"]):
print(f" * {f}")
print("→ Rebuild required")
return False
# --- SOFT WARNING ---
if added:
print(f"\n+ {len(added)} new file(s) detected (not yet embedded):")
for f in sorted(added):
if status["added"]:
print(f"\n+ {len(status['added'])} new file(s) detected:")
for f in sorted(status["added"]):
print(f" + {f}")
print("→ Continuing with existing cache (new files will be ignored)")
print("\n✓ Cache usable")
return True
if status["valid"]:
print("\n✓ Cache usable (incremental updates possible)")
else:
print("\n→ Full rebuild required")
return status
# --------------------------------------------------
# Save updated cache file
# --------------------------------------------------
def save_updated_cache():
    """Persist the in-memory corpus and its metadata to disk.

    Writes the embeddings/chunks/sources triple to CACHE_FILE (NumPy .npz)
    and a JSON sidecar (CACHE_META) recording which book files were indexed
    and their byte sizes, which cache_is_valid() later compares against.
    """
    np.savez(
        CACHE_FILE,
        embeddings=chunk_embeddings,
        chunks=np.array(all_chunks, dtype=object),
        sources=np.array(all_sources, dtype=object)
    )
    # Record current on-disk sizes so future runs can detect modifications.
    sizes = {}
    for path in book_files:
        if os.path.exists(path):
            sizes[path] = os.path.getsize(path)
    meta = {"book_files": book_files, "file_sizes": sizes}
    with open(CACHE_META, "w") as meta_fh:
        json.dump(meta, meta_fh)
    print("Cache updated.")
# --------------------------------------------------
# Remove chunks from embeddings for specified files
# --------------------------------------------------
def remove_chunks_for_files(files_to_remove):
    """Drop cached chunks/embeddings originating from the given files.

    Rebinds the module-level ``all_chunks``, ``all_sources`` and
    ``chunk_embeddings`` so stale data for modified files is removed before
    re-embedding them.

    Args:
        files_to_remove: iterable of source file paths whose chunks are
            outdated. Empty/None is a no-op.
    """
    global all_chunks, all_sources, chunk_embeddings
    if not files_to_remove:
        return
    # Membership is tested once per chunk; a set makes that O(1) instead of
    # scanning the list for every chunk.
    remove_set = set(files_to_remove)
    keep_indices = [
        i for i, src in enumerate(all_sources)
        if src not in remove_set
    ]
    all_chunks = [all_chunks[i] for i in keep_indices]
    all_sources = [all_sources[i] for i in keep_indices]
    # NumPy fancy indexing selects the kept rows in their original order.
    chunk_embeddings = chunk_embeddings[keep_indices]
    print(f"Removed old chunks for {len(files_to_remove)} modified file(s)")
# -------------------------
# Process new and modified files
# -------------------------
def process_incremental_updates(status):
    """Embed new/modified book files and merge them into the cached corpus.

    Args:
        status: dict produced by cache_is_valid() with at least the keys
            "added" (new files not yet embedded) and "modified" (files whose
            size changed since the cache was written).

    Side effects: rebinds the module-level ``all_chunks``, ``all_sources``
    and ``chunk_embeddings``, and rewrites the on-disk cache via
    save_updated_cache().
    """
    global all_chunks, all_sources, chunk_embeddings
    files_to_process = status["added"] + status["modified"]
    # Step 1 — drop stale chunks for modified files only (added files have
    # no cached chunks to remove).
    remove_chunks_for_files(status["modified"])
    new_chunks = []
    new_sources = []
    # Step 2 — read, clean and chunk each new or modified file.
    for book_name in files_to_process:
        print(f"[Updating] {book_name}")
        with open(book_name, "rb") as f:
            raw = f.read()
        try:
            text = raw.decode("utf-8")
        except UnicodeDecodeError:
            # Narrowed from a bare except: only a decode failure should
            # trigger the Windows-1252 fallback for legacy-encoded files.
            text = raw.decode("cp1252")
        # Honor the "# IGNORE" sentinel on the first non-blank line.
        first_line = text.lstrip().splitlines()[0] if text.strip() else ""
        if first_line.strip().upper().startswith("# IGNORE"):
            print(f"Skipping {book_name} (marked IGNORE)")
            continue
        book_text = clean_text(text)
        chunks = chunk_text(book_text)
        new_chunks.extend(chunks)
        new_sources.extend([book_name] * len(chunks))
    # Step 3 — every candidate file was skipped or empty.
    if not new_chunks:
        print("No new chunks to add.")
        return
    # Step 4 — embed only the new chunks (the incremental win vs. a rebuild).
    print(f"Embedding {len(new_chunks)} new chunks...")
    new_embeddings = embed_model.encode(new_chunks, convert_to_tensor=False)
    # Step 5 — append to the in-memory corpus, or initialize it when the
    # cache was empty.
    if len(all_chunks) == 0:
        all_chunks = new_chunks
        all_sources = new_sources
        chunk_embeddings = np.array(new_embeddings)
    else:
        all_chunks.extend(new_chunks)
        all_sources.extend(new_sources)
        chunk_embeddings = np.vstack([chunk_embeddings, new_embeddings])
    # Step 6 — persist the merged corpus and refreshed metadata.
    save_updated_cache()
# -------------------------
# Load or build embeddings
# -------------------------
all_chunks = []
all_sources = []
if cache_is_valid():
status = cache_is_valid()
# if cache_is_valid():
if status["valid"]:
print("Loading embeddings from cache...")
data = np.load(CACHE_FILE, allow_pickle=True)
chunk_embeddings = data["embeddings"]
all_chunks = list(data["chunks"])
all_sources = list(data["sources"])
print(f"Total chunks loaded from cache: {len(all_chunks)}")
# check if we have additions or modifications
if status["added"] or status["modified"]:
print("\n[Incremental update triggered]")
process_incremental_updates(status)
else:
print("Building embeddings from scratch...")
for book_name in book_files: