Appendix B: Implementation Details
This appendix provides key implementation details for reproducing the Memory Palace system.
Confidence Scoring Implementation
import numpy as np
from dataclasses import dataclass
from typing import List
@dataclass
class Memory:
    """A single memory-palace entry plus the metadata used at retrieval time."""
    id: str  # unique identifier for this memory
    subject: str  # what the memory is about
    image: str  # mnemonic image text; may embed a "[Verify: ...]" marker
    content: str  # the underlying factual content being memorized
    verify_token: str  # token expected to appear in a faithful recall
    smashin_score: int  # SMASHIN SCOPE encoding-quality score, 0-12
    embedding: List[float]  # vector embedding used for semantic similarity
def calculate_retrieval_score(
    memory: Memory,
    query: str,
    query_embedding: List[float],
    response: str,
    alpha: float = 0.5,
    beta: float = 0.3,
    gamma: float = 0.2
) -> float:
    """
    Calculate retrieval confidence score.
    score = α * sim(m, q) + β * verify(m) + γ * smashin(m)
    where:
    - sim(m, q) = cosine similarity between memory and query embeddings
    - verify(m) = 1 if verification token present in response, 0 otherwise
    - smashin(m) = normalized SMASHIN SCOPE score (0-1)
    """
    # Semantic similarity: cosine of the two embedding vectors.
    mem_vec = np.array(memory.embedding)
    qry_vec = np.array(query_embedding)
    similarity = float(np.dot(mem_vec, qry_vec) / (np.linalg.norm(mem_vec) * np.linalg.norm(qry_vec)))

    # Verification: case-insensitive substring check for the token.
    token_found = memory.verify_token.lower() in response.lower()
    verify_score = 1.0 if token_found else 0.0

    # Encoding quality: map the 0-12 SMASHIN SCOPE score into [0, 1].
    smashin_normalized = memory.smashin_score / 12.0

    return alpha * similarity + beta * verify_score + gamma * smashin_normalized
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        a, b: equal-length numeric vectors.

    Returns:
        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero
        magnitude — the unguarded expression divides by zero and
        yields NaN for degenerate (all-zero) embeddings.
    """
    a_vec, b_vec = np.array(a), np.array(b)
    denom = np.linalg.norm(a_vec) * np.linalg.norm(b_vec)
    if denom == 0.0:
        # A zero vector carries no direction; treat it as dissimilar.
        return 0.0
    return float(np.dot(a_vec, b_vec) / denom)


# --- Hierarchical Index Implementation ---
import json
from typing import Dict, List, Optional, Tuple
class HierarchicalIndex:
    """Three-level hierarchical memory index.

    Level 1 (root) maps keywords to domains, level 2 maps each domain's
    topics to memory locations, and level 3 is the stored memories
    themselves, loaded on demand.
    """

    def __init__(self, root_path: str):
        """Initialize the index structures and load them from *root_path*."""
        self.root_path = root_path
        self.root_index: Dict[str, str] = {}  # keyword -> domain
        self.domain_indices: Dict[str, Dict] = {}  # domain -> {topic: location}
        self._load_indices()

    def retrieve(self, query: str, k: int = 3) -> Tuple[List[dict], int]:
        """
        2-hop retrieval: keyword -> domain -> memories

        Args:
            query: free-text query to resolve against the index.
            k: maximum number of memories to return.

        Returns:
            (memories, context_size_bytes), where the byte count sums the
            serialized index and memory text pulled into context.
        """
        context_size = 0

        # Hop 1: map query keywords onto a domain via the root index,
        # falling back to semantic matching when no keyword hits.
        keywords = self._extract_keywords(query)
        domain = None
        for kw in keywords:
            if kw.lower() in self.root_index:
                domain = self.root_index[kw.lower()]
                break
        if not domain:
            domain = self._semantic_domain_match(query)
        context_size += len(str(self.root_index))

        # Hop 2: score every topic in the chosen domain against the query.
        domain_index = self.domain_indices.get(domain, {})
        context_size += len(str(domain_index))

        candidates = []
        for topic, location in domain_index.items():
            score = self._score_match(query, topic)
            candidates.append((score, topic, location))

        # Sort on the score alone: sorting the raw tuples would compare
        # topic/location on score ties and raises TypeError when locations
        # are non-orderable values (e.g. dicts).
        candidates.sort(key=lambda c: c[0], reverse=True)
        top_k = candidates[:k]

        # Load the top-k memories and account for their context cost.
        memories = []
        for score, topic, location in top_k:
            memory = self._load_memory(location)
            memories.append(memory)
            context_size += len(str(memory))

        return memories, context_size


# --- Verification Token Checker ---
import re
from typing import Tuple, Optional
class VerificationChecker:
    """Check LLM responses for verification tokens."""

    def __init__(self, strict_mode: bool = True):
        """Store the checking mode (strict by default)."""
        self.strict_mode = strict_mode

    def extract_token(self, memory_image: str) -> Optional[str]:
        """Pull the token out of a "[Verify: <token>]" marker, if any."""
        found = re.search(r'\[Verify:\s*([^\]]+)\]', memory_image)
        if found is None:
            return None
        return found.group(1).strip()

    def check_response(self, response: str, expected_token: str) -> Tuple[bool, str]:
        """
        Check if response contains the expected verification token.
        Returns: (is_valid, explanation)
        """
        # An empty (or None) token means there is nothing to verify.
        if not expected_token:
            return True, "No verification token required"
        # Case-insensitive substring match against the response text.
        if expected_token.lower() in response.lower():
            return True, f"Verification token '{expected_token}' found"
        return False, f"HALLUCINATION SUSPECTED: Token '{expected_token}' not found"


# --- Red Queen Protocol ---
from enum import Enum
from dataclasses import dataclass
from typing import List
class Strategy(Enum):
    """Memory-selection strategies for the Red Queen testing protocol."""
    RANDOM = "random"  # sample memories without targeting
    WEAK_SPOTS = "weak-spots"  # focus on previously weak memories -- presumed; confirm in selector
    DEPTH_FIRST = "depth-first"  # drill into one area in depth -- presumed; confirm in selector
    ADVERSARIAL = "adversarial"  # hardest/adversarial questioning -- presumed; confirm in selector
@dataclass
class Question:
    """A single examiner-generated question targeting one memory."""
    memory_id: str  # id of the memory this question probes
    question_text: str  # the question posed to the learner
    difficulty: str  # difficulty label assigned by the examiner
    expected_elements: List[str]  # elements a correct answer should contain
@dataclass
class Evaluation:
    """Evaluator verdict for one memory's recall attempt."""
    memory_id: str  # id of the evaluated memory
    score: float  # recall quality score
    gaps: List[str]  # missing/incorrect elements identified by the evaluator
    should_evolve: bool  # True when the memory is weak enough to strengthen
async def run_red_queen(
    palace: dict,
    strategy: Strategy = Strategy.WEAK_SPOTS,
    question_count: int = 10
) -> List[Evaluation]:
    """
    Run adversarial testing protocol.
    1. Examiner generates questions
    2. Learner attempts blind recall
    3. Evaluator scores and identifies gaps
    4. Evolver strengthens weak memories
    """
    # Step 1: choose target memories and generate exam questions.
    targets = select_memories(palace, strategy, question_count)
    exam_questions = await generate_questions(targets, strategy)

    # Step 2: learner answers from anchors alone (blind recall).
    recall_attempts = await attempt_recall(exam_questions, anchors_only=True)

    # Steps 3-4: score against ground truth, then evolve the weak entries.
    results = await evaluate_answers(exam_questions, recall_attempts, ground_truth=targets)
    needs_work = [evaluation for evaluation in results if evaluation.should_evolve]
    if needs_work:
        await strengthen_memories(needs_work, palace)
    return results


# --- Running Benchmarks ---
# Set up an isolated Python environment with the benchmark dependencies.
cd paper/code
python -m venv .venv
source .venv/bin/activate
pip install numpy pandas matplotlib requests

# Run the local Ollama benchmark.
python ollama_benchmark.py

# Run the cloud Gemini benchmark (replace your_key_here with a real API key).
export GEMINI_API_KEY=your_key_here
python gemini_benchmark.py

# Generate visualizations from the benchmark results.
python visualize_results.py

# Run the SOTA comparison.
python sota_comparison.py

All benchmark code and results are available in the paper/code/ and paper/results/ directories.