Add books

This commit is contained in:
2026-04-04 22:56:29 -04:00
parent 3ccc529045
commit 535e07a61c
3 changed files with 10303 additions and 23 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -80,13 +80,10 @@ import torch
# Semantic chunking
# Better table handling
# -------------------------
# Knowledge base selection
# -------------------------
BOOK_DIR = 'Books/History' # just a string
BOOK_DIR = 'Books/Religeon' # just a string
book_files = []
for f in Path(BOOK_DIR).rglob('*'):
@@ -377,57 +374,71 @@ def topics_are_related(question, history, lookback=3):
if not history:
return False
# Very short questions with pronouns are almost certainly follow-ups
q_lower = question.lower()
# Very short questions with pronouns are almost certainly follow-ups
pronoun_followups = {
"he","she","they","him","her","them","his","it",
"this","that","these","those","who","what","where","when"
}
q_words_all = set(q_lower.replace('?','').replace('.','').split())
if len(q_words_all) <= 5 and q_words_all & pronoun_followups:
print(f" [Pronoun follow-up detected — enriching]")
return True
# Get meaningful words from current question
q_words = set(q_lower.split()) - STOPWORDS
if not q_words:
return False
q_words = set(q_lower.replace('?','').replace('.','').split()) - STOPWORDS
# Get words from recent history questions
recent = history[-lookback:]
history_words = set()
for exchange in recent:
history_words.update(exchange["question"].lower().split())
history_words.update(
exchange["question"].lower().replace('?','').replace('.','').split()
)
history_words -= STOPWORDS
# Check overlap
# Pronoun follow-up check — only if history has meaningful content
pronoun_followups = {
"he","she","they","him","her","them","his","it",
"this","that","these","those"
}
q_words_all = set(q_lower.replace('?','').replace('.','').split())
if len(q_words_all) <= 5 and q_words_all & pronoun_followups:
if history_words:
print(f" [Pronoun follow-up detected — enriching]")
return True
if not q_words:
return False
# Check meaningful word overlap
overlap = len(q_words & history_words)
print(f" [Topic overlap: {overlap} word(s)]")
return overlap > 0
def enrich_query_with_history(question):
    """
    Add context from recent history to improve retrieval
    for short follow-up questions.

    The questions of the last three exchanges in the module-level
    ``conversation_history`` are prepended to ``question``.

    The original question is returned unchanged when:
      * there is no conversation history yet,
      * the question already has 8 or more words,
      * ``topics_are_related`` reports no relation to the history, or
      * the enriched query would exceed 30 words.

    Args:
        question: The user's current question as a string.

    Returns:
        Either the enriched query string or the original ``question``.
    """
    if not conversation_history:
        return question

    # Only enrich questions under 8 words; longer questions are used as-is.
    if len(question.split()) >= 8:
        return question

    # Check if topic has shifted
    if not topics_are_related(question, conversation_history):
        print(" [Topic shift detected — no enrichment]")
        return question

    # Look back up to 3 exchanges for context
    recent = conversation_history[-3:]
    context_words = " ".join(ex["question"] for ex in recent)
    enriched = f"{context_words} {question}"

    # Don't enrich if result is too long
    if len(enriched.split()) > 30:
        print(" [Enriched query too long — using original]")
        return question

    print(f" [Enriched query: {enriched}]")
    return enriched
# -------------------------
# Retrieve top relevant chunks
# -------------------------