Reorganize the corpus. Prevent answers from being truncated mid-sentence.

This commit is contained in:
2026-04-06 12:24:05 -04:00
parent 2e3dd2fff0
commit cb7d906ef7
9 changed files with 26 additions and 10 deletions

View File

@@ -119,14 +119,14 @@ conversation_history = []
# Per-difficulty-level retrieval/generation configuration.
#   expand      - whether to expand the query before retrieval
#   top_k       - number of retrieved chunks fed into the prompt
#   max_tokens  - generation budget; tripled from level 3 up so answers
#                 are not truncated mid-sentence
#   context_len - max characters of retrieved context passed to the LLM
# NOTE: the diff view had left stale duplicate keys here (old values for
# 3-10 shadowed by the *3 entries); the dead duplicates are removed.
LEVELS = {
    1: {"expand": False, "top_k": 1, "max_tokens": 75, "context_len": 500},
    2: {"expand": False, "top_k": 1, "max_tokens": 75, "context_len": 600},
    3: {"expand": False, "top_k": 2, "max_tokens": 100*3, "context_len": 700},
    4: {"expand": False, "top_k": 2, "max_tokens": 100*3, "context_len": 800},
    5: {"expand": False, "top_k": 3, "max_tokens": 125*3, "context_len": 1000},
    6: {"expand": False, "top_k": 3, "max_tokens": 150*3, "context_len": 1200},
    7: {"expand": True, "top_k": 3, "max_tokens": 150*3, "context_len": 1400},
    8: {"expand": True, "top_k": 4, "max_tokens": 175*3, "context_len": 1600},
    9: {"expand": True, "top_k": 5, "max_tokens": 175*3, "context_len": 1800},
    10: {"expand": True, "top_k": 5, "max_tokens": 200*3, "context_len": 2000},
}
# -------------------------
@@ -569,6 +569,21 @@ def parse_input(user_input):
return question, filter_term
return user_input, SEARCH_FILTER
# --------------------------
# -----------------------------
# Clip retrieved context at a sentence boundary so the LLM is never
# handed a dangling, half-finished fragment.
# -----------------------------
def truncate_at_sentence(text, max_chars):
    """Return *text* clipped to at most *max_chars* characters,
    preferring to end on the last '.', '!' or '?' within the limit.

    Falls back to a plain hard cut when no sentence terminator is
    found inside the window (or the only one sits at index 0).
    """
    if len(text) <= max_chars:
        return text
    clipped = text[:max_chars]
    # Rightmost sentence-ending mark inside the window; -1 if absent.
    cut = max(clipped.rfind(mark) for mark in ".!?")
    if cut > 0:
        return clipped[:cut + 1]
    return clipped
# -------------------------
# Ask question
# -------------------------
@@ -585,7 +600,8 @@ def ask_question(question, show_sources=False, filter_term=None):
print(chunk[:300])
print("--- End chunks ---\n")
context = " ".join(top_chunks)[:level_cfg["context_len"]]
# context = " ".join(top_chunks)[:level_cfg["context_len"]]
context = truncate_at_sentence(" ".join(top_chunks), level_cfg["context_len"])
# Build conversation history string
history_text = ""
@@ -604,7 +620,7 @@ def ask_question(question, show_sources=False, filter_term=None):
f"Do not reference outside sources. "
f"Do not repeat or echo the conversation history in your answer. "
f"Do not include labels, separator lines, or notes in your answer. "
f"Stop immediately after answering.\n\n"
f"Stop immediately after answering, ending on a complete sentence."
)
if history_text: