Appendix B: Implementation Details
This appendix provides key implementation details for reproducing the Memory Palace system.
Confidence Scoring Implementation
import numpy as np
from dataclasses import dataclass
from typing import List
@dataclass
class Memory:
    """A single memory-palace entry plus the metadata used at retrieval time."""
    id: str  # unique identifier for this memory
    subject: str  # what the memory is about
    image: str  # mnemonic image text; may embed a "[Verify: ...]" marker
    content: str  # the underlying factual content being memorized
    verify_token: str  # token expected to appear in a faithful recall
    smashin_score: int  # SMASHIN SCOPE encoding-quality score, 0-12
    embedding: List[float]  # vector embedding used for semantic similarity
def calculate_retrieval_score(
    memory: Memory,
    query: str,
    query_embedding: List[float],
    response: str,
    alpha: float = 0.5,
    beta: float = 0.3,
    gamma: float = 0.2
) -> float:
    """
    Calculate retrieval confidence score.
    score = α * sim(m, q) + β * verify(m) + γ * smashin(m)
    where:
    - sim(m, q) = cosine similarity between memory and query embeddings
    - verify(m) = 1 if verification token present in response, 0 otherwise
    - smashin(m) = normalized SMASHIN SCOPE score (0-1)
    """
    # Semantic similarity: cosine of the two embedding vectors.
    mem_vec = np.array(memory.embedding)
    qry_vec = np.array(query_embedding)
    similarity = float(np.dot(mem_vec, qry_vec) / (np.linalg.norm(mem_vec) * np.linalg.norm(qry_vec)))

    # Verification: case-insensitive substring check for the token.
    token_found = memory.verify_token.lower() in response.lower()
    verify_score = 1.0 if token_found else 0.0

    # Encoding quality: map the 0-12 SMASHIN SCOPE score into [0, 1].
    smashin_normalized = memory.smashin_score / 12.0

    return alpha * similarity + beta * verify_score + gamma * smashin_normalized
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        a, b: equal-length numeric vectors.

    Returns:
        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero
        magnitude — the unguarded expression divides by zero and
        yields NaN for degenerate (all-zero) embeddings.
    """
    a_vec, b_vec = np.array(a), np.array(b)
    denom = np.linalg.norm(a_vec) * np.linalg.norm(b_vec)
    if denom == 0.0:
        # A zero vector carries no direction; treat it as dissimilar.
        return 0.0
    return float(np.dot(a_vec, b_vec) / denom)


# --- Hierarchical Index Implementation ---
import json
from typing import Dict, List, Optional, Tuple
class HierarchicalIndex:
    """Three-level hierarchical memory index.

    Level 1 (root) maps keywords to domains, level 2 maps each domain's
    topics to memory locations, and level 3 is the stored memories
    themselves, loaded on demand.
    """

    def __init__(self, root_path: str):
        """Initialize the index structures and load them from *root_path*."""
        self.root_path = root_path
        self.root_index: Dict[str, str] = {}  # keyword -> domain
        self.domain_indices: Dict[str, Dict] = {}  # domain -> {topic: location}
        self._load_indices()

    def retrieve(self, query: str, k: int = 3) -> Tuple[List[dict], int]:
        """
        2-hop retrieval: keyword -> domain -> memories

        Args:
            query: free-text query to resolve against the index.
            k: maximum number of memories to return.

        Returns:
            (memories, context_size_bytes), where the byte count sums the
            serialized index and memory text pulled into context.
        """
        context_size = 0

        # Hop 1: map query keywords onto a domain via the root index,
        # falling back to semantic matching when no keyword hits.
        keywords = self._extract_keywords(query)
        domain = None
        for kw in keywords:
            if kw.lower() in self.root_index:
                domain = self.root_index[kw.lower()]
                break
        if not domain:
            domain = self._semantic_domain_match(query)
        context_size += len(str(self.root_index))

        # Hop 2: score every topic in the chosen domain against the query.
        domain_index = self.domain_indices.get(domain, {})
        context_size += len(str(domain_index))

        candidates = []
        for topic, location in domain_index.items():
            score = self._score_match(query, topic)
            candidates.append((score, topic, location))

        # Sort on the score alone: sorting the raw tuples would compare
        # topic/location on score ties and raises TypeError when locations
        # are non-orderable values (e.g. dicts).
        candidates.sort(key=lambda c: c[0], reverse=True)
        top_k = candidates[:k]

        # Load the top-k memories and account for their context cost.
        memories = []
        for score, topic, location in top_k:
            memory = self._load_memory(location)
            memories.append(memory)
            context_size += len(str(memory))

        return memories, context_size


# --- Verification Token Checker ---
import re
from typing import Tuple, Optional
class VerificationChecker:
    """Check LLM responses for verification tokens."""

    def __init__(self, strict_mode: bool = True):
        """Store the checking mode (strict by default)."""
        self.strict_mode = strict_mode

    def extract_token(self, memory_image: str) -> Optional[str]:
        """Pull the token out of a "[Verify: <token>]" marker, if any."""
        found = re.search(r'\[Verify:\s*([^\]]+)\]', memory_image)
        if found is None:
            return None
        return found.group(1).strip()

    def check_response(self, response: str, expected_token: str) -> Tuple[bool, str]:
        """
        Check if response contains the expected verification token.
        Returns: (is_valid, explanation)
        """
        # An empty (or None) token means there is nothing to verify.
        if not expected_token:
            return True, "No verification token required"
        # Case-insensitive substring match against the response text.
        if expected_token.lower() in response.lower():
            return True, f"Verification token '{expected_token}' found"
        return False, f"HALLUCINATION SUSPECTED: Token '{expected_token}' not found"


# --- Red Queen Protocol ---
from enum import Enum
from dataclasses import dataclass
from typing import List
class Strategy(Enum):
    """Memory-selection strategies for the Red Queen testing protocol."""
    RANDOM = "random"  # sample memories without targeting
    WEAK_SPOTS = "weak-spots"  # focus on previously weak memories -- presumed; confirm in selector
    DEPTH_FIRST = "depth-first"  # drill into one area in depth -- presumed; confirm in selector
    ADVERSARIAL = "adversarial"  # hardest/adversarial questioning -- presumed; confirm in selector
@dataclass
class Question:
    """A single examiner-generated question targeting one memory."""
    memory_id: str  # id of the memory this question probes
    question_text: str  # the question posed to the learner
    difficulty: str  # difficulty label assigned by the examiner
    expected_elements: List[str]  # elements a correct answer should contain
@dataclass
class Evaluation:
    """Evaluator verdict for one memory's recall attempt."""
    memory_id: str  # id of the evaluated memory
    score: float  # recall quality score
    gaps: List[str]  # missing/incorrect elements identified by the evaluator
    should_evolve: bool  # True when the memory is weak enough to strengthen
async def run_red_queen(
    palace: dict,
    strategy: Strategy = Strategy.WEAK_SPOTS,
    question_count: int = 10
) -> List[Evaluation]:
    """
    Run adversarial testing protocol.
    1. Examiner generates questions
    2. Learner attempts blind recall
    3. Evaluator scores and identifies gaps
    4. Evolver strengthens weak memories
    """
    # Step 1: choose target memories and generate exam questions.
    targets = select_memories(palace, strategy, question_count)
    exam_questions = await generate_questions(targets, strategy)

    # Step 2: learner answers from anchors alone (blind recall).
    recall_attempts = await attempt_recall(exam_questions, anchors_only=True)

    # Steps 3-4: score against ground truth, then evolve the weak entries.
    results = await evaluate_answers(exam_questions, recall_attempts, ground_truth=targets)
    needs_work = [evaluation for evaluation in results if evaluation.should_evolve]
    if needs_work:
        await strengthen_memories(needs_work, palace)
    return results


# --- Running Benchmarks ---
# Set up an isolated Python environment with the benchmark dependencies.
cd paper/code
python -m venv .venv
source .venv/bin/activate
pip install numpy pandas matplotlib requests

# Run the local Ollama benchmark.
python ollama_benchmark.py

# Run the cloud Gemini benchmark (replace your_key_here with a real API key).
export GEMINI_API_KEY=your_key_here
python gemini_benchmark.py

# Generate visualizations from the benchmark results.
python visualize_results.py

# Run the SOTA comparison.
python sota_comparison.py

All benchmark code and results are available in the paper/code/ and paper/results/ directories.