Incremental updates
This commit is contained in:
Binary file not shown.
@@ -1 +1 @@
|
||||
{"book_files": ["Books\\Accounting\\Accounting\\Account.txt", "Books\\Accounting\\Accounting\\BankFeed.txt", "Books\\Accounting\\Accounting\\BillingChange.txt", "Books\\Accounting\\Accounting\\CashReceipts.txt", "Books\\Accounting\\Accounting\\Common.txt", "Books\\Accounting\\Accounting\\Endorsement.txt", "Books\\Accounting\\Accounting\\Fifo.txt", "Books\\Accounting\\Accounting\\Integrations.txt", "Books\\Accounting\\Accounting\\PayoffBalance.txt", "Books\\Accounting\\Accounting\\Policy.txt", "Books\\Accounting\\Accounting\\Quote.txt", "Books\\Accounting\\Accounting\\Repository.txt", "Books\\Accounting\\Accounting\\Services.txt", "Books\\Accounting\\Accounting\\Starter.txt", "Books\\Accounting\\Accounting\\StructuredTags.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt", "Books\\Accounting\\Jira\\IA-2691.txt", "Books\\Accounting\\Jira\\IA-2698.txt", "Books\\Accounting\\Jira\\IA-2827.txt", "Books\\Accounting\\Jira\\IA-2852.txt", "Books\\Accounting\\Jira\\IA-2854.txt", "Books\\Accounting\\Jira\\IA-2855.txt", "Books\\Accounting\\Jira\\IA-2858.txt", "Books\\Accounting\\Jira\\IA-2866.txt", "Books\\Accounting\\Oracle\\Oracle.txt"], "file_sizes": {"Books\\Accounting\\Accounting\\Account.txt": 7587, "Books\\Accounting\\Accounting\\BankFeed.txt": 14170, "Books\\Accounting\\Accounting\\BillingChange.txt": 9648, 
"Books\\Accounting\\Accounting\\CashReceipts.txt": 9208, "Books\\Accounting\\Accounting\\Common.txt": 7444, "Books\\Accounting\\Accounting\\Endorsement.txt": 10019, "Books\\Accounting\\Accounting\\Fifo.txt": 9201, "Books\\Accounting\\Accounting\\Integrations.txt": 8451, "Books\\Accounting\\Accounting\\PayoffBalance.txt": 8413, "Books\\Accounting\\Accounting\\Policy.txt": 11662, "Books\\Accounting\\Accounting\\Quote.txt": 6815, "Books\\Accounting\\Accounting\\Repository.txt": 7395, "Books\\Accounting\\Accounting\\Services.txt": 6633, "Books\\Accounting\\Accounting\\Starter.txt": 5075, "Books\\Accounting\\Accounting\\StructuredTags.txt": 5606, "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt": 24315, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt": 22960, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt": 11447, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt": 18039, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt": 21206, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt": 12886, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt": 0, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt": 7669, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt": 8241, "Books\\Accounting\\Jira\\IA-2691.txt": 8131, "Books\\Accounting\\Jira\\IA-2698.txt": 6072, "Books\\Accounting\\Jira\\IA-2827.txt": 7218, "Books\\Accounting\\Jira\\IA-2852.txt": 5870, "Books\\Accounting\\Jira\\IA-2854.txt": 6748, "Books\\Accounting\\Jira\\IA-2855.txt": 6776, "Books\\Accounting\\Jira\\IA-2858.txt": 5925, "Books\\Accounting\\Jira\\IA-2866.txt": 4241, "Books\\Accounting\\Oracle\\Oracle.txt": 262504}}
|
||||
{"book_files": ["Books\\Accounting\\Accounting\\Account.txt", "Books\\Accounting\\Accounting\\BankFeed.txt", "Books\\Accounting\\Accounting\\BillingChange.txt", "Books\\Accounting\\Accounting\\CashReceipts.txt", "Books\\Accounting\\Accounting\\Common.txt", "Books\\Accounting\\Accounting\\Endorsement.txt", "Books\\Accounting\\Accounting\\Fifo.txt", "Books\\Accounting\\Accounting\\Integrations.txt", "Books\\Accounting\\Accounting\\PayoffBalance.txt", "Books\\Accounting\\Accounting\\Policy.txt", "Books\\Accounting\\Accounting\\Quote.txt", "Books\\Accounting\\Accounting\\Repository.txt", "Books\\Accounting\\Accounting\\Services.txt", "Books\\Accounting\\Accounting\\Starter.txt", "Books\\Accounting\\Accounting\\StructuredTags.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt", "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt", "Books\\Accounting\\Jira\\IA-2691.txt", "Books\\Accounting\\Jira\\IA-2698.txt", "Books\\Accounting\\Jira\\IA-2827.txt", "Books\\Accounting\\Jira\\IA-2852.txt", "Books\\Accounting\\Jira\\IA-2854.txt", "Books\\Accounting\\Jira\\IA-2855.txt", "Books\\Accounting\\Jira\\IA-2858.txt", "Books\\Accounting\\Jira\\IA-2866.txt"], "file_sizes": {"Books\\Accounting\\Accounting\\Account.txt": 7587, "Books\\Accounting\\Accounting\\BankFeed.txt": 14170, "Books\\Accounting\\Accounting\\BillingChange.txt": 9648, "Books\\Accounting\\Accounting\\CashReceipts.txt": 9208, 
"Books\\Accounting\\Accounting\\Common.txt": 7444, "Books\\Accounting\\Accounting\\Endorsement.txt": 10019, "Books\\Accounting\\Accounting\\Fifo.txt": 9201, "Books\\Accounting\\Accounting\\Integrations.txt": 8451, "Books\\Accounting\\Accounting\\PayoffBalance.txt": 8413, "Books\\Accounting\\Accounting\\Policy.txt": 11662, "Books\\Accounting\\Accounting\\Quote.txt": 6815, "Books\\Accounting\\Accounting\\Repository.txt": 7395, "Books\\Accounting\\Accounting\\Services.txt": 6633, "Books\\Accounting\\Accounting\\Starter.txt": 5075, "Books\\Accounting\\Accounting\\StructuredTags.txt": 5606, "Books\\Accounting\\AccountsReceivable\\AccountsReceivable.txt": 24315, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableAdjustments.txt": 22960, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceiptApplications.txt": 11447, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCashReceipts.txt": 18039, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCommon.txt": 21206, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoApplications.txt": 12886, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemoRefunds.txt": 0, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableCreditMemos.txt": 7669, "Books\\Accounting\\AccountsReceivable\\AccountsReceivableInvoiceInstallments.txt": 8241, "Books\\Accounting\\Jira\\IA-2691.txt": 8131, "Books\\Accounting\\Jira\\IA-2698.txt": 6072, "Books\\Accounting\\Jira\\IA-2827.txt": 7218, "Books\\Accounting\\Jira\\IA-2852.txt": 5870, "Books\\Accounting\\Jira\\IA-2854.txt": 6748, "Books\\Accounting\\Jira\\IA-2855.txt": 6776, "Books\\Accounting\\Jira\\IA-2858.txt": 5925, "Books\\Accounting\\Jira\\IA-2866.txt": 4241}}
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
230
Chartwell.py
230
Chartwell.py
@@ -141,20 +141,6 @@ MODES = {
|
||||
"End your response with a single period."
|
||||
)
|
||||
},
|
||||
# "research": {
|
||||
# "print_msg": "Research mode.",
|
||||
# "prompt_instruction": (
|
||||
# "You are a helpful research assistant. "
|
||||
# "Restrict your response strictly to the provided context. "
|
||||
# "If the source material is exhausted, stop writing. "
|
||||
# "If a relationship or entity is not explicitly documented in the context, do not include it. "
|
||||
# # "Do not infer, supplement, or use external training knowledge. "
|
||||
# "Be direct and concise. "
|
||||
# "Never repeat the context or instructions. "
|
||||
# "Never echo the question. "
|
||||
# "End your answer with a single period. "
|
||||
# )
|
||||
# },
|
||||
|
||||
"research": {
|
||||
"print_msg": "Research mode.",
|
||||
@@ -162,31 +148,19 @@ MODES = {
|
||||
"You are a helpful research assistant. "
|
||||
"Restrict your response strictly to the provided context. "
|
||||
"If the source material is exhausted, stop writing. "
|
||||
# "If the context does not contain enough information to fully answer, provide only what is explicitly supported or say you don't know. "
|
||||
"If a relationship or entity is not explicitly documented in the context, do not include it. "
|
||||
"Do not repeat the same information in different wording. "
|
||||
"If multiple context passages express the same idea, summarize it once. "
|
||||
"If the context contains repetitive legal or procedural text, merge it into a single concise statement. "
|
||||
"Do not list multiple similar verses. "
|
||||
"Prefer one coherent explanation over multiple extracted quotations. "
|
||||
# "Do not infer, supplement, or use external training knowledge. "
|
||||
"Do not infer, guess, or use external knowledge under any circumstances. "
|
||||
"If the answer is not explicitly stated in the context, respond with: \"I don't know based on the provided context.\""
|
||||
# "Be direct and concise. "
|
||||
"Never repeat the context or instructions. "
|
||||
"Never echo the question. "
|
||||
"End your answer with a single period. "
|
||||
)
|
||||
},
|
||||
|
||||
# "advanced": {
|
||||
# "print_msg": "Advanced mode.",
|
||||
# "prompt_instruction": (
|
||||
# "You are adept at mathematics and computer programming. "
|
||||
# "You are a linguist able to put together complex ideas and work with formulations and workflows."
|
||||
# )
|
||||
# },
|
||||
|
||||
"advanced": {
|
||||
"print_msg": "Advanced mode.",
|
||||
"prompt_instruction": (
|
||||
@@ -420,19 +394,6 @@ def clean_text(text):
|
||||
# This is for the enrichment pipeline if it is enabled. It Uses the local LLM to extract key metadata from a chunk.
|
||||
# --------------------------------
|
||||
|
||||
# PROMPT_TEMPLATE = (
|
||||
# "<|start_header_id|>user<|end_header_id|>\n"
|
||||
# "Extract tags from the text below. Respond ONLY in this exact format: "
|
||||
# "[Time: | Loc: | Entity: | Topic:]\n"
|
||||
# # "[Time: | Loc: | Entity: | Theme:]\n"
|
||||
# "ALL fields must be filled. If uncertain, make a best guess.\n"
|
||||
# "Limit to 5 most important entities. No explanation.\n"
|
||||
# "Text: {text}\n"
|
||||
# "Tags: [<|eot_id|>\n"
|
||||
# "<|start_header_id|>assistant<|end_header_id|>\n"
|
||||
# )
|
||||
|
||||
|
||||
PROMPT_TEMPLATE = (
|
||||
"<|start_header_id|>user<|end_header_id|>\n"
|
||||
"TAGGING OPERATION. NOT A CONVERSATION. NO EXPLANATIONS.\n"
|
||||
@@ -585,30 +546,21 @@ def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
|
||||
# -------------------------
|
||||
# Check if cache is valid
|
||||
# -------------------------
|
||||
# def cache_is_valid():
|
||||
# print (f"Checking for existing enriched cache in {BOOK_DIR}...")
|
||||
# if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
|
||||
# return False
|
||||
# with open(CACHE_META, "r") as f:
|
||||
# meta = json.load(f)
|
||||
# if meta.get("book_files") != book_files:
|
||||
# return False
|
||||
# for book_name in book_files:
|
||||
# if not os.path.exists(book_name):
|
||||
# continue
|
||||
# stored_size = meta.get("file_sizes", {}).get(book_name)
|
||||
# actual_size = os.path.getsize(book_name)
|
||||
# if stored_size != actual_size:
|
||||
# return False
|
||||
# return True
|
||||
|
||||
def cache_is_valid():
    """Validate the embedding cache against the current set of book files.

    Returns a status dict (not a bool — callers inspect the fields):
        valid              -- False only when a FULL rebuild is required:
                              cache/metadata files are missing, or metadata
                              has no size entry for a file it claims to know
        added              -- files on disk but absent from cached metadata
                              (repaired incrementally by the caller)
        modified           -- cached files whose on-disk size changed
                              (repaired incrementally by the caller)
        missing_embeddings -- cached files lacking a size entry

    Relies on module globals: BOOK_DIR, CACHE_FILE, CACHE_META, book_files.
    """
    print(f"\nChecking for existing enriched cache in {BOOK_DIR}...")

    status = {
        "valid": True,
        "added": [],
        "modified": [],
        "missing_embeddings": []
    }

    # --- HARD FAIL: missing cache files ---
    if not os.path.exists(CACHE_FILE) or not os.path.exists(CACHE_META):
        print("X Missing cache or metadata → rebuild required")
        status["valid"] = False
        return status

    with open(CACHE_META, "r") as f:
        meta = json.load(f)

    cached_files = set(meta.get("book_files", []))
    current_files = set(book_files)

    # --- Detect NEW files (embedded incrementally by the caller) ---
    status["added"] = list(current_files - cached_files)

    # --- Check EXISTING files ---
    for book_name in current_files:
        if not os.path.exists(book_name):
            continue

        # Skip new files (handled separately via status["added"])
        if book_name not in cached_files:
            continue

        stored_size = meta.get("file_sizes", {}).get(book_name)
        actual_size = os.path.getsize(book_name)

        # Cached file with no recorded size → its embeddings are untrustworthy
        if stored_size is None:
            status["missing_embeddings"].append(book_name)
            continue

        # Size changed → file needs re-chunking / re-embedding
        if stored_size != actual_size:
            status["modified"].append(book_name)

    # Missing embeddings force a full rebuild; added/modified files do NOT —
    # the caller repairs those through process_incremental_updates().
    if status["missing_embeddings"]:
        print(f"\nX Missing embeddings for {len(status['missing_embeddings'])} file(s):")
        for name in sorted(status["missing_embeddings"]):
            print(f"  * {name}")
        status["valid"] = False

    if status["modified"]:
        print(f"\nX {len(status['modified'])} file(s) changed:")
        for name in sorted(status["modified"]):
            print(f"  * {name}")

    if status["added"]:
        print(f"\n+ {len(status['added'])} new file(s) detected:")
        for name in sorted(status["added"]):
            print(f"  + {name}")

    if status["valid"]:
        print("\n✓ Cache usable (incremental updates possible)")
    else:
        print("\n→ Full rebuild required")

    return status
|
||||
|
||||
# --------------------------------------------------
|
||||
# Save updated cache file
|
||||
# --------------------------------------------------
|
||||
def save_updated_cache():
    """Persist the current chunks/embeddings and refresh the cache metadata.

    Writes the numpy archive (CACHE_FILE) and the JSON metadata file
    (CACHE_META) from the module-level globals, then prints a confirmation.
    """
    # Serialize the in-memory corpus alongside its embeddings.
    chunk_array = np.array(all_chunks, dtype=object)
    source_array = np.array(all_sources, dtype=object)
    np.savez(
        CACHE_FILE,
        embeddings=chunk_embeddings,
        chunks=chunk_array,
        sources=source_array
    )

    # Record current on-disk sizes so cache_is_valid() can detect changes later.
    sizes = {}
    for book in book_files:
        if os.path.exists(book):
            sizes[book] = os.path.getsize(book)

    meta = {"book_files": book_files, "file_sizes": sizes}
    with open(CACHE_META, "w") as handle:
        json.dump(meta, handle)

    print("Cache updated.")
|
||||
|
||||
# --------------------------------------------------
|
||||
# Remove chunks from embeddings for specified files
|
||||
# --------------------------------------------------
|
||||
def remove_chunks_for_files(files_to_remove):
    """Drop all cached chunks/embeddings that originate from the given files.

    files_to_remove -- iterable of source file paths whose chunks are stale.

    Rebuilds the module-level parallel structures (all_chunks, all_sources,
    chunk_embeddings), keeping only entries whose source is NOT in
    files_to_remove. No-op when files_to_remove is empty.
    """
    global all_chunks, all_sources, chunk_embeddings

    if not files_to_remove:
        return

    # Build a set once: O(1) membership per chunk instead of scanning the
    # list for every entry in all_sources.
    remove_set = set(files_to_remove)

    keep_indices = [
        i for i, src in enumerate(all_sources)
        if src not in remove_set
    ]

    # Rebuild the three parallel structures in lockstep so indices stay aligned.
    all_chunks = [all_chunks[i] for i in keep_indices]
    all_sources = [all_sources[i] for i in keep_indices]
    chunk_embeddings = chunk_embeddings[keep_indices]

    print(f"Removed old chunks for {len(files_to_remove)} modified file(s)")
|
||||
|
||||
# -------------------------
|
||||
# Process new and modified files
|
||||
# -------------------------
|
||||
def process_incremental_updates(status):
    """Re-embed only the files flagged as added or modified by cache_is_valid.

    status -- dict with "added" and "modified" lists of file paths.

    Removes stale chunks for modified files, reads/cleans/chunks the new
    text, embeds it, appends to the module-level corpus, and saves the
    updated cache. Files whose first line starts with "# IGNORE" are skipped.
    """
    global all_chunks, all_sources, chunk_embeddings

    files_to_process = status["added"] + status["modified"]

    # Step 1 — remove outdated chunks (ONLY modified files; added files
    # have no cached chunks yet)
    remove_chunks_for_files(status["modified"])

    new_chunks = []
    new_sources = []

    # Step 2 — read, clean and chunk each new/modified file
    for book_name in files_to_process:
        print(f"[Updating] {book_name}")

        with open(book_name, "rb") as f:
            raw = f.read()

        # Try UTF-8 first, then the common Windows fallback. Catch ONLY
        # decode failures — a bare except would also swallow
        # KeyboardInterrupt/SystemExit and mask unrelated bugs.
        try:
            text = raw.decode("utf-8")
        except UnicodeDecodeError:
            text = raw.decode("cp1252")

        # Skip files whose first non-blank line marks them as ignored
        first_line = text.lstrip().splitlines()[0] if text.strip() else ""
        if first_line.strip().upper().startswith("# IGNORE"):
            print(f"Skipping {book_name} (marked IGNORE)")
            continue

        book_text = clean_text(text)
        chunks = chunk_text(book_text)

        new_chunks.extend(chunks)
        new_sources.extend([book_name] * len(chunks))

    # Step 3 — nothing to add
    if not new_chunks:
        print("No new chunks to add.")
        return

    # Step 4 — embed
    print(f"Embedding {len(new_chunks)} new chunks...")
    new_embeddings = embed_model.encode(new_chunks, convert_to_tensor=False)

    # Step 5 — append to (or initialize) the in-memory corpus
    if len(all_chunks) == 0:
        # Edge case: cache was empty — adopt the new data wholesale
        all_chunks = new_chunks
        all_sources = new_sources
        chunk_embeddings = np.array(new_embeddings)
    else:
        all_chunks.extend(new_chunks)
        all_sources.extend(new_sources)
        chunk_embeddings = np.vstack([chunk_embeddings, new_embeddings])

    # Step 6 — persist everything
    save_updated_cache()
|
||||
|
||||
# -------------------------
|
||||
# Load or build embeddings
|
||||
@@ -672,13 +725,22 @@ def cache_is_valid():
|
||||
all_chunks = []
|
||||
all_sources = []
|
||||
|
||||
if cache_is_valid():
|
||||
status = cache_is_valid()
|
||||
|
||||
# if cache_is_valid():
|
||||
if status["valid"]:
|
||||
print("Loading embeddings from cache...")
|
||||
data = np.load(CACHE_FILE, allow_pickle=True)
|
||||
chunk_embeddings = data["embeddings"]
|
||||
all_chunks = list(data["chunks"])
|
||||
all_sources = list(data["sources"])
|
||||
print(f"Total chunks loaded from cache: {len(all_chunks)}")
|
||||
|
||||
# check if we have additions or modifications
|
||||
if status["added"] or status["modified"]:
|
||||
print("\n[Incremental update triggered]")
|
||||
process_incremental_updates(status)
|
||||
|
||||
else:
|
||||
print("Building embeddings from scratch...")
|
||||
for book_name in book_files:
|
||||
|
||||
Reference in New Issue
Block a user