Add books

This commit is contained in:
2026-04-04 22:56:29 -04:00
parent 3ccc529045
commit 535e07a61c
3 changed files with 10303 additions and 23 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -80,13 +80,10 @@ import torch
# Semantic chunking # Semantic chunking
# Better table handling # Better table handling
# ------------------------- # -------------------------
# Knowledge base selection # Knowledge base selection
# ------------------------- # -------------------------
BOOK_DIR = 'Books/History' # just a string BOOK_DIR = 'Books/Religeon' # just a string
book_files = [] book_files = []
for f in Path(BOOK_DIR).rglob('*'): for f in Path(BOOK_DIR).rglob('*'):
@@ -377,57 +374,71 @@ def topics_are_related(question, history, lookback=3):
if not history: if not history:
return False return False
# Very short questions with pronouns are almost certainly follow-ups
q_lower = question.lower() q_lower = question.lower()
# Very short questions with pronouns are almost certainly follow-ups
pronoun_followups = {
"he","she","they","him","her","them","his","it",
"this","that","these","those","who","what","where","when"
}
q_words_all = set(q_lower.replace('?','').replace('.','').split())
if len(q_words_all) <= 5 and q_words_all & pronoun_followups:
print(f" [Pronoun follow-up detected — enriching]")
return True
# Get meaningful words from current question # Get meaningful words from current question
q_words = set(q_lower.split()) - STOPWORDS q_words = set(q_lower.replace('?','').replace('.','').split()) - STOPWORDS
if not q_words:
return False
# Get words from recent history questions # Get words from recent history questions
recent = history[-lookback:] recent = history[-lookback:]
history_words = set() history_words = set()
for exchange in recent: for exchange in recent:
history_words.update(exchange["question"].lower().split()) history_words.update(
exchange["question"].lower().replace('?','').replace('.','').split()
)
history_words -= STOPWORDS history_words -= STOPWORDS
# Pronoun follow-up check — only if history has meaningful content
pronoun_followups = {
"he","she","they","him","her","them","his","it",
"this","that","these","those"
}
q_words_all = set(q_lower.replace('?','').replace('.','').split())
# Check overlap if len(q_words_all) <= 5 and q_words_all & pronoun_followups:
if history_words:
print(f" [Pronoun follow-up detected — enriching]")
return True
if not q_words:
return False
# Check meaningful word overlap
overlap = len(q_words & history_words) overlap = len(q_words & history_words)
print(f" [Topic overlap: {overlap} word(s)]") print(f" [Topic overlap: {overlap} word(s)]")
return overlap > 0 return overlap > 0
def enrich_query_with_history(question):
    """Prepend recent conversation questions to a short follow-up question.

    Reads ``conversation_history``, which is defined outside this function and
    is used here as a sliceable sequence whose items expose a ``"question"``
    key.

    Args:
        question: The user's current question as a string.

    Returns:
        The questions of the last three history entries joined with
        *question*, or *question* unchanged when any of these holds:

        * there is no conversation history;
        * the question has 8 or more whitespace-separated words;
        * ``topics_are_related`` reports no relation to the history;
        * the enriched query would exceed 30 words.
    """
    if not conversation_history:
        return question

    # Only enrich questions under 8 words
    if len(question.split()) >= 8:
        return question

    # Check if topic has shifted
    if not topics_are_related(question, conversation_history):
        print(" [Topic shift detected — no enrichment]")
        return question

    # Look back up to 3 exchanges for context
    recent = conversation_history[-3:]
    context_words = " ".join(ex["question"] for ex in recent)
    enriched = f"{context_words} {question}"

    # Don't enrich if result is too long
    if len(enriched.split()) > 30:
        print(" [Enriched query too long — using original]")
        return question

    print(f" [Enriched query: {enriched}]")
    return enriched
# ------------------------- # -------------------------
# Retrieve top relevant chunks # Retrieve top relevant chunks
# ------------------------- # -------------------------