move note search

2026-04-03 15:44:25 -07:00
parent f410df3e7a
commit acc42c4381
14 changed files with 10 additions and 6 deletions
--- a/skills/notesearch/README.md
+++ b/skills/notesearch/README.md
@@ -0,0 +1,76 @@
+# notesearch
+
+Local vector search over markdown notes using LlamaIndex + Ollama.
+
+Point it at an Obsidian vault (or any folder of `.md` files), build a vector index, and search by meaning — not just keywords.
+
+## Setup
+
+```bash
+cd ~/.openclaw/workspace/skills/notesearch
+uv sync
+```
+
+Requires Ollama running locally with an embedding model pulled:
+
+```bash
+ollama pull qwen3-embedding:0.6b
+```
+
+## Usage
+
+### Build the index
+
+```bash
+./notesearch.sh --vault /path/to/vault index
+```
+
+### Search
+
+```bash
+./notesearch.sh search "where do I get my allergy shots"
+```
+
+Output:
+
+```
+[0.87] Health/allergy.md
+Started allergy shots in March 2026. Clinic is at 123 Main St.
+
+[0.72] Daily/2026-03-25.md
+Went to allergy appointment today.
+```
+
+### Configuration
+
+Edit `config.json`:
+
+```json
+{
+  "vault": "/home/lyx/Documents/obsidian-yanxin",
+  "index_dir": null,
+  "ollama_url": "http://localhost:11434",
+  "embedding_model": "qwen3-embedding:0.6b"
+}
+```
+
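+Values can also be set via flags or env vars. Priority: **flag > env var > config.json > fallback**. `--embedding-model` (with its env var and config key) applies only to `index`; `search` reuses the model recorded in the index metadata. `--top-k` applies only to `search`.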
+
+| Flag | Env var | Config key | Default |
+|------|---------|------------|---------|
+| `--vault` | `NOTESEARCH_VAULT` | `vault` | `/home/lyx/Documents/obsidian-yanxin` |
+| `--index-dir` | `NOTESEARCH_INDEX_DIR` | `index_dir` | `<vault>/.index/` |
+| `--ollama-url` | `NOTESEARCH_OLLAMA_URL` | `ollama_url` | `http://localhost:11434` |
+| `--embedding-model` | `NOTESEARCH_EMBEDDING_MODEL` | `embedding_model` | `qwen3-embedding:0.6b` |
+| `--top-k` | — | — | `5` |
+
+## Tests
+
+```bash
+uv run pytest
+```
+
+## How it works
+
+1. **Index**: reads all `.md` files, splits on markdown headings, embeds each chunk via Ollama, stores vectors locally
+2. **Search**: embeds your query, finds the most similar chunks, returns them with file paths and relevance scores
--- a/skills/notesearch/_meta.json
+++ b/skills/notesearch/_meta.json
@@ -0,0 +1,4 @@
+{
+  "slug": "notesearch",
+  "version": "0.1.0"
+}
--- a/skills/notesearch/config.json
+++ b/skills/notesearch/config.json
@@ -0,0 +1,6 @@
+{
+  "vault": "/home/lyx/Documents/obsidian-yanxin",
+  "index_dir": null,
+  "ollama_url": "http://localhost:11434",
+  "embedding_model": "qwen3-embedding:0.6b"
+}
--- a/skills/notesearch/notesearch.sh
+++ b/skills/notesearch/notesearch.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# Launcher for the notesearch CLI: runs `python -m notesearch` through uv from
+# the directory that contains this script, forwarding all arguments unchanged.
+set -euo pipefail
+
+# Resolve this script's own directory and make it the working directory.
+here="$(cd "$(dirname "$0")" && pwd)"
+cd "$here"
+
+exec uv run python -m notesearch "$@"
--- a/skills/notesearch/notesearch/__init__.py
+++ b/skills/notesearch/notesearch/__init__.py
--- a/skills/notesearch/notesearch/__main__.py
+++ b/skills/notesearch/notesearch/__main__.py
@@ -0,0 +1,5 @@
+"""Entry point for `python -m notesearch`: hands control to the CLI."""
+
+from notesearch import cli
+
+cli.main()
--- a/skills/notesearch/notesearch/cli.py
+++ b/skills/notesearch/notesearch/cli.py
@@ -0,0 +1,89 @@
+"""CLI entry point for notesearch."""
+
+import argparse
+import os
+import sys
+
+from notesearch.core import (
+    FALLBACK_EMBEDDING_MODEL,
+    FALLBACK_OLLAMA_URL,
+    FALLBACK_VAULT,
+    build_index,
+    get_config_value,
+    search,
+)
+
+
+def _resolve(flag_value: str | None, env_name: str, config_key: str, fallback: str) -> str:
+    """Pick a setting from the first source that provides a non-empty value.
+
+    Sources are consulted in this order: the command-line flag, the
+    environment variable ``env_name``, then ``config_key`` in config.json
+    (which itself falls back to ``fallback``).
+    """
+    for candidate in (flag_value, os.environ.get(env_name)):
+        # Empty strings count as "not provided", same as None.
+        if candidate:
+            return candidate
+    return get_config_value(config_key, fallback)
+
+
+def cmd_index(args: argparse.Namespace) -> None:
+    """Handle the `index` subcommand: resolve settings, build, report on stdout."""
+    vault_path = _resolve(args.vault, "NOTESEARCH_VAULT", "vault", FALLBACK_VAULT)
+    # The empty-string fallback collapses to None, meaning "no explicit index dir".
+    index_location = (
+        _resolve(args.index_dir, "NOTESEARCH_INDEX_DIR", "index_dir", "") or None
+    )
+    base_url = _resolve(
+        args.ollama_url, "NOTESEARCH_OLLAMA_URL", "ollama_url", FALLBACK_OLLAMA_URL
+    )
+    embedding_model = _resolve(
+        args.model,
+        "NOTESEARCH_EMBEDDING_MODEL",
+        "embedding_model",
+        FALLBACK_EMBEDDING_MODEL,
+    )
+
+    print(f"Indexing vault: {vault_path}")
+    print(f"Model: {embedding_model}")
+    saved_to = build_index(vault_path, index_location, base_url, embedding_model)
+    print(f"Index saved to: {saved_to}")
+
+
+def cmd_search(args: argparse.Namespace) -> None:
+    """Handle the `search` subcommand: run the query and print each hit.
+
+    Every hit is printed as a ``[score] file`` header line, the chunk text,
+    and a blank separator line. A fixed message is printed when nothing
+    matches.
+    """
+    vault_path = _resolve(args.vault, "NOTESEARCH_VAULT", "vault", FALLBACK_VAULT)
+    # The empty-string fallback collapses to None, meaning "no explicit index dir".
+    index_location = (
+        _resolve(args.index_dir, "NOTESEARCH_INDEX_DIR", "index_dir", "") or None
+    )
+    base_url = _resolve(
+        args.ollama_url, "NOTESEARCH_OLLAMA_URL", "ollama_url", FALLBACK_OLLAMA_URL
+    )
+
+    hits = search(args.query, vault_path, index_location, base_url, args.top_k)
+    if hits:
+        for hit in hits:
+            print(f"[{hit['score']:.2f}] {hit['file']}")
+            print(hit["text"])
+            print()
+    else:
+        print("No results found.")
+
+
+def main() -> None:
+    """Parse command-line arguments and dispatch to the chosen subcommand.
+
+    The shared options (``--vault``, ``--index-dir``, ``--ollama-url``) are
+    accepted both before and after the subcommand, so
+    ``notesearch index --vault PATH`` and ``notesearch --vault PATH index``
+    are equivalent. When an option is given in both places, the value after
+    the subcommand wins.
+
+    ``FileNotFoundError`` and ``ValueError`` raised by a command are printed
+    to stderr and the process exits with status 1.
+    """
+    shared_options = (
+        ("--vault", "Path to the Obsidian vault"),
+        ("--index-dir", "Path to store/load the index"),
+        ("--ollama-url", "Ollama API URL"),
+    )
+
+    parser = argparse.ArgumentParser(
+        prog="notesearch",
+        description="Local vector search over markdown notes",
+    )
+    for flag, help_text in shared_options:
+        parser.add_argument(flag, help=help_text)
+
+    # The same options again, for use after the subcommand. A subparser's
+    # results are copied over the top-level namespace, so SUPPRESS is needed
+    # to keep an omitted option from resetting a value that was given before
+    # the subcommand back to None.
+    shared = argparse.ArgumentParser(add_help=False)
+    for flag, help_text in shared_options:
+        shared.add_argument(flag, default=argparse.SUPPRESS, help=help_text)
+
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    # index
+    idx_parser = subparsers.add_parser(
+        "index", parents=[shared], help="Build the search index"
+    )
+    idx_parser.add_argument("--embedding-model", dest="model", help="Ollama embedding model name")
+
+    # search
+    search_parser = subparsers.add_parser(
+        "search", parents=[shared], help="Search the notes"
+    )
+    search_parser.add_argument("query", help="Search query")
+    search_parser.add_argument("--top-k", type=int, default=5, help="Number of results")
+
+    args = parser.parse_args()
+
+    try:
+        if args.command == "index":
+            cmd_index(args)
+        elif args.command == "search":
+            cmd_search(args)
+    except (FileNotFoundError, ValueError) as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+# Allow running this file directly as a script in addition to calling main()
+# from elsewhere.
+if __name__ == "__main__":
+    main()
--- a/skills/notesearch/notesearch/core.py
+++ b/skills/notesearch/notesearch/core.py
@@ -0,0 +1,124 @@
+"""Core indexing and search logic."""
+
+import json
+from pathlib import Path
+
+from llama_index.core import (
+    SimpleDirectoryReader,
+    StorageContext,
+    VectorStoreIndex,
+    load_index_from_storage,
+)
+from llama_index.core.node_parser import MarkdownNodeParser
+from llama_index.embeddings.ollama import OllamaEmbedding
+
+
+# Hardcoded last-resort settings; used as parameter defaults in this module.
+FALLBACK_VAULT = "/home/lyx/Documents/obsidian-yanxin"
+FALLBACK_EMBEDDING_MODEL = "qwen3-embedding:0.6b"
+FALLBACK_OLLAMA_URL = "http://localhost:11434"
+# Name of the JSON file written into the index directory by build_index().
+METADATA_FILE = "notesearch_meta.json"
+# config.json sits one directory above this package directory.
+CONFIG_FILE = Path(__file__).parent.parent / "config.json"
+
+
+def load_config() -> dict:
+    """Load config from config.json. Returns empty dict if not found.
+
+    The file is always decoded as UTF-8 so that non-ASCII values (for
+    example vault paths) are read the same way regardless of the system
+    locale.
+
+    Raises:
+        json.JSONDecodeError: if the file exists but is not valid JSON.
+    """
+    # Read first and handle absence afterwards: avoids the window between an
+    # exists() check and the read in which the file could disappear.
+    try:
+        raw = CONFIG_FILE.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        return {}
+    return json.loads(raw)
+
+
+def get_config_value(key: str, fallback: str) -> str:
+    """Look up ``key`` in config.json, substituting ``fallback`` when needed.
+
+    The fallback is used when the key is absent or its value is falsy
+    (for example ``null`` or an empty string).
+    """
+    value = load_config().get(key)
+    return value if value else fallback
+
+
+def _get_index_dir(vault_path: str, index_dir: str | None) -> Path:
+    """Return the index location: ``index_dir`` if given, else ``<vault>/.index``."""
+    return Path(index_dir) if index_dir else Path(vault_path) / ".index"
+
+
+def _get_embed_model(ollama_url: str, model: str) -> OllamaEmbedding:
+    """Create the Ollama embedding client for ``model`` served at ``ollama_url``."""
+    embedder = OllamaEmbedding(base_url=ollama_url, model_name=model)
+    return embedder
+
+
+def build_index(
+    vault_path: str = FALLBACK_VAULT,
+    index_dir: str | None = None,
+    ollama_url: str = FALLBACK_OLLAMA_URL,
+    model: str = FALLBACK_EMBEDDING_MODEL,
+) -> Path:
+    """Build a vector index from markdown files in the vault.
+
+    Args:
+        vault_path: Directory searched recursively for ``.md`` files.
+        index_dir: Where to store the index; ``<vault>/.index`` when falsy.
+        ollama_url: Base URL of the Ollama server used for embeddings.
+        model: Name of the embedding model; recorded in the index metadata.
+
+    Returns:
+        The directory the index and its metadata file were written to.
+
+    Raises:
+        FileNotFoundError: if ``vault_path`` is not an existing directory.
+        ValueError: if the vault contains no markdown files.
+    """
+    vault = Path(vault_path)
+    if not vault.is_dir():
+        raise FileNotFoundError(f"Vault not found: {vault_path}")
+
+    # Check for markdown files before loading (SimpleDirectoryReader raises
+    # its own error on empty dirs, but we want a clearer message). Only the
+    # first match is needed, so stop scanning as soon as one is found.
+    if next(vault.rglob("*.md"), None) is None:
+        raise ValueError(f"No markdown files found in {vault_path}")
+
+    documents = SimpleDirectoryReader(
+        str(vault),
+        recursive=True,
+        required_exts=[".md"],
+    ).load_data()
+
+    embed_model = _get_embed_model(ollama_url, model)
+    parser = MarkdownNodeParser()
+    nodes = parser.get_nodes_from_documents(documents)
+
+    index = VectorStoreIndex(nodes, embed_model=embed_model)
+
+    # Create the index directory only once there is something to store. If it
+    # were created up front, a build that fails (empty vault, embedding error)
+    # would leave an empty directory behind, and search() would then treat
+    # that directory as an existing index.
+    idx_path = _get_index_dir(vault_path, index_dir)
+    idx_path.mkdir(parents=True, exist_ok=True)
+    index.storage_context.persist(persist_dir=str(idx_path))
+
+    # Save metadata so we can detect model mismatches
+    meta = {"model": model, "ollama_url": ollama_url, "vault_path": vault_path}
+    (idx_path / METADATA_FILE).write_text(json.dumps(meta, indent=2))
+
+    return idx_path
+
+
+def search(
+    query: str,
+    vault_path: str = FALLBACK_VAULT,
+    index_dir: str | None = None,
+    ollama_url: str = FALLBACK_OLLAMA_URL,
+    top_k: int = 5,
+) -> list[dict]:
+    """Return the chunks of a previously built index most similar to ``query``.
+
+    The embedding model name is read from the metadata file in the index
+    directory, falling back to ``FALLBACK_EMBEDDING_MODEL`` when the file or
+    its ``"model"`` key is missing.
+
+    Returns:
+        One dict per hit with the keys ``"score"`` (rounded to 4 decimals),
+        ``"file"`` (the node's ``file_path`` metadata, or ``"unknown"``) and
+        ``"text"``.
+
+    Raises:
+        FileNotFoundError: if the index directory does not exist.
+    """
+    idx_path = _get_index_dir(vault_path, index_dir)
+    if not idx_path.exists():
+        raise FileNotFoundError(
+            f"Index not found at {idx_path}. Run 'notesearch index' first."
+        )
+
+    # Queries are embedded with the model recorded when the index was built.
+    model = FALLBACK_EMBEDDING_MODEL
+    meta_file = idx_path / METADATA_FILE
+    if meta_file.exists():
+        stored = json.loads(meta_file.read_text())
+        model = stored.get("model", FALLBACK_EMBEDDING_MODEL)
+
+    embedder = _get_embed_model(ollama_url, model)
+    storage = StorageContext.from_defaults(persist_dir=str(idx_path))
+    index = load_index_from_storage(storage, embed_model=embedder)
+
+    hits = index.as_retriever(similarity_top_k=top_k).retrieve(query)
+
+    matches: list[dict] = []
+    for hit in hits:
+        matches.append(
+            {
+                "score": round(hit.score, 4),
+                "file": hit.node.metadata.get("file_path", "unknown"),
+                "text": hit.node.text,
+            }
+        )
+    return matches
--- a/skills/notesearch/pyproject.toml
+++ b/skills/notesearch/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "notesearch"
+version = "0.1.0"
+description = "Local vector search over markdown notes using LlamaIndex + Ollama"
+requires-python = ">=3.11"
+dependencies = [
+    "llama-index",
+    "llama-index-embeddings-ollama",
+]
+
+[project.scripts]
+notesearch = "notesearch.cli:main"
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
+[dependency-groups]
+dev = ["pytest"]
--- a/skills/notesearch/tests/__init__.py
+++ b/skills/notesearch/tests/__init__.py
--- a/skills/notesearch/tests/test_core.py
+++ b/skills/notesearch/tests/test_core.py
@@ -0,0 +1,152 @@
+"""Tests for notesearch core functionality."""
+
+import hashlib
+import json
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+
+import pytest
+
+from llama_index.core.base.embeddings.base import BaseEmbedding
+from notesearch.core import FALLBACK_EMBEDDING_MODEL, METADATA_FILE, build_index, search
+
+
+class FakeEmbedding(BaseEmbedding):
+    """Embedding stub whose vectors depend only on the input text.
+
+    Vectors are derived from the text's MD5 digest, so identical text always
+    yields an identical vector.
+    """
+
+    model_name: str = "test-model"
+
+    def _get_text_embedding(self, text: str) -> list[float]:
+        digest = hashlib.md5(text.encode()).digest()
+        # 16 digest bytes scaled into [0, 1], repeated 48 times -> 768 values.
+        scaled = [byte / 255.0 for byte in digest]
+        return scaled * 48
+
+    def _get_query_embedding(self, query: str) -> list[float]:
+        # Queries are embedded exactly like document text.
+        return self._get_text_embedding(query)
+
+    async def _aget_text_embedding(self, text: str) -> list[float]:
+        return self._get_text_embedding(text)
+
+    async def _aget_query_embedding(self, query: str) -> list[float]:
+        return self._get_text_embedding(query)
+
+
+def _mock_embed_model(*args: Any, **kwargs: Any) -> FakeEmbedding:
+    """Stand-in for ``_get_embed_model``: ignores its arguments entirely."""
+    fake = FakeEmbedding()
+    return fake
+
+
+@pytest.fixture
+def sample_vault(tmp_path: Path) -> Path:
+    """Build a throwaway vault holding three notes, two of them in subfolders."""
+    root = tmp_path / "vault"
+    root.mkdir()
+
+    # Relative path -> file body; parent folders are created on demand.
+    notes = {
+        "health/allergy.md": (
+            "# Allergy Treatment\n\n"
+            "Started allergy shots in March 2026.\n"
+            "Weekly schedule: Tuesday and Thursday.\n"
+            "Clinic is at 123 Main St.\n"
+        ),
+        "work/project-alpha.md": (
+            "# Project Alpha\n\n"
+            "## Goals\n"
+            "Launch the new API by Q2.\n"
+            "Migrate all users to v2 endpoints.\n\n"
+            "## Status\n"
+            "Backend is 80% done. Frontend blocked on design review.\n"
+        ),
+        "recipes.md": (
+            "# Favorite Recipes\n\n"
+            "## Pasta Carbonara\n"
+            "Eggs, pecorino, guanciale, black pepper.\n"
+            "Cook pasta al dente, mix off heat.\n"
+        ),
+    }
+    for relative, body in notes.items():
+        target = root / relative
+        target.parent.mkdir(exist_ok=True)
+        target.write_text(body)
+
+    return root
+
+
+@pytest.fixture
+def empty_vault(tmp_path: Path) -> Path:
+    """Provide a vault directory that exists but contains no files."""
+    bare = tmp_path / "empty_vault"
+    bare.mkdir()
+    return bare
+
+
+class TestBuildIndex:
+    """Behaviour of ``build_index`` for invalid vaults and for a normal build."""
+
+    def test_missing_vault(self, tmp_path: Path) -> None:
+        missing = tmp_path / "nonexistent"
+        with pytest.raises(FileNotFoundError, match="Vault not found"):
+            build_index(vault_path=str(missing))
+
+    def test_empty_vault(self, empty_vault: Path) -> None:
+        with pytest.raises(ValueError, match="No markdown files found"):
+            build_index(vault_path=str(empty_vault))
+
+    @patch("notesearch.core._get_embed_model", _mock_embed_model)
+    def test_builds_index(self, sample_vault: Path, tmp_path: Path) -> None:
+        target_dir = tmp_path / "index"
+        built_at = build_index(
+            vault_path=str(sample_vault),
+            index_dir=str(target_dir),
+        )
+
+        # The returned path is the requested directory and it now holds metadata.
+        assert built_at == target_dir
+        assert built_at.exists()
+        meta_path = built_at / METADATA_FILE
+        assert meta_path.exists()
+
+        recorded = json.loads(meta_path.read_text())
+        assert recorded["vault_path"] == str(sample_vault)
+        assert "model" in recorded
+
+    @patch("notesearch.core._get_embed_model", _mock_embed_model)
+    def test_index_stores_model_metadata(self, sample_vault: Path, tmp_path: Path) -> None:
+        target_dir = tmp_path / "index"
+        build_index(
+            vault_path=str(sample_vault),
+            index_dir=str(target_dir),
+            model="custom-model",
+        )
+
+        recorded = json.loads((target_dir / METADATA_FILE).read_text())
+        assert recorded["model"] == "custom-model"
+
+
+class TestSearch:
+    """Behaviour of ``search`` without an index and against a freshly built one."""
+
+    def test_missing_index(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError, match="Index not found"):
+            search("test query", vault_path=str(tmp_path))
+
+    @patch("notesearch.core._get_embed_model", _mock_embed_model)
+    def test_search_returns_results(self, sample_vault: Path, tmp_path: Path) -> None:
+        target_dir = tmp_path / "index"
+        build_index(vault_path=str(sample_vault), index_dir=str(target_dir))
+
+        hits = search(
+            "allergy shots",
+            vault_path=str(sample_vault),
+            index_dir=str(target_dir),
+            top_k=3,
+        )
+
+        assert len(hits) > 0
+        # Every hit must expose all three result fields.
+        for field in ("score", "file", "text"):
+            assert all(field in hit for hit in hits)
+
+    @patch("notesearch.core._get_embed_model", _mock_embed_model)
+    def test_search_respects_top_k(self, sample_vault: Path, tmp_path: Path) -> None:
+        target_dir = tmp_path / "index"
+        build_index(vault_path=str(sample_vault), index_dir=str(target_dir))
+
+        hits = search(
+            "anything",
+            vault_path=str(sample_vault),
+            index_dir=str(target_dir),
+            top_k=1,
+        )
+
+        assert len(hits) == 1
--- a/skills/notesearch/uv.lock
+++ b/skills/notesearch/uv.lock