move note search

This commit is contained in:
Yanxin Lu
2026-04-03 15:44:25 -07:00
parent f410df3e7a
commit acc42c4381
14 changed files with 10 additions and 6 deletions

View File

@@ -0,0 +1,76 @@
# notesearch
Local vector search over markdown notes using LlamaIndex + Ollama.
Point it at an Obsidian vault (or any folder of `.md` files), build a vector index, and search by meaning — not just keywords.
## Setup
```bash
cd ~/.openclaw/workspace/skills/notesearch
uv sync
```
Requires Ollama running locally with an embedding model pulled:
```bash
ollama pull qwen3-embedding:0.6b
```
## Usage
### Build the index
```bash
./notesearch.sh index --vault /path/to/vault
```
### Search
```bash
./notesearch.sh search "where do I get my allergy shots"
```
Output:
```
[0.87] Health/allergy.md
Started allergy shots in March 2026. Clinic is at 123 Main St.
[0.72] Daily/2026-03-25.md
Went to allergy appointment today.
```
### Configuration
Edit `config.json`:
```json
{
"vault": "/home/lyx/Documents/obsidian-yanxin",
"index_dir": null,
"ollama_url": "http://localhost:11434",
"embedding_model": "qwen3-embedding:0.6b"
}
```
Values can also be set via flags or env vars. Priority: **flag > env var > config.json > fallback**.
| Flag | Env var | Config key | Default |
|------|---------|------------|---------|
| `--vault` | `NOTESEARCH_VAULT` | `vault` | `/home/lyx/Documents/obsidian-yanxin` |
| `--index-dir` | `NOTESEARCH_INDEX_DIR` | `index_dir` | `<vault>/.index/` |
| `--ollama-url` | `NOTESEARCH_OLLAMA_URL` | `ollama_url` | `http://localhost:11434` |
| `--embedding-model` | `NOTESEARCH_EMBEDDING_MODEL` | `embedding_model` | `qwen3-embedding:0.6b` |
| `--top-k` | — | — | `5` |
## Tests
```bash
uv run pytest
```
## How it works
1. **Index**: reads all `.md` files, splits on markdown headings, embeds each chunk via Ollama, stores vectors locally
2. **Search**: embeds your query, finds the most similar chunks, returns them with file paths and relevance scores

View File

@@ -0,0 +1,4 @@
{
"slug": "notesearch",
"version": "0.1.0"
}

View File

@@ -0,0 +1,6 @@
{
"vault": "/home/lyx/Documents/obsidian-yanxin",
"index_dir": null,
"ollama_url": "http://localhost:11434",
"embedding_model": "qwen3-embedding:0.6b"
}

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -euo pipefail

# Run the notesearch CLI from the directory containing this script so that
# `uv run` picks up the project's pyproject.toml regardless of the caller's cwd.
script_dir="$(cd -- "$(dirname -- "$0")" && pwd)"
cd -- "$script_dir"
exec uv run python -m notesearch "$@"

View File

View File

@@ -0,0 +1,5 @@
"""Allow running as `python -m notesearch`."""
from notesearch.cli import main
main()

View File

@@ -0,0 +1,89 @@
"""CLI entry point for notesearch."""
import argparse
import os
import sys
from notesearch.core import (
FALLBACK_EMBEDDING_MODEL,
FALLBACK_OLLAMA_URL,
FALLBACK_VAULT,
build_index,
get_config_value,
search,
)
def _resolve(flag_value: str | None, env_name: str, config_key: str, fallback: str) -> str:
    """Resolve a setting with priority: flag > env var > config.json > fallback.

    Empty strings are treated as "not set" at every level, matching the
    truthiness checks used throughout the CLI.
    """
    for candidate in (flag_value, os.environ.get(env_name)):
        if candidate:
            return candidate
    return get_config_value(config_key, fallback)
def cmd_index(args: argparse.Namespace) -> None:
    """Handle the `index` subcommand: resolve settings and build the index."""
    vault_path = _resolve(args.vault, "NOTESEARCH_VAULT", "vault", FALLBACK_VAULT)
    # An empty string means "not configured": let core fall back to <vault>/.index/.
    index_location = _resolve(args.index_dir, "NOTESEARCH_INDEX_DIR", "index_dir", "") or None
    ollama = _resolve(args.ollama_url, "NOTESEARCH_OLLAMA_URL", "ollama_url", FALLBACK_OLLAMA_URL)
    embed_model = _resolve(args.model, "NOTESEARCH_EMBEDDING_MODEL", "embedding_model", FALLBACK_EMBEDDING_MODEL)
    print(f"Indexing vault: {vault_path}")
    print(f"Model: {embed_model}")
    saved_to = build_index(vault_path, index_location, ollama, embed_model)
    print(f"Index saved to: {saved_to}")
def cmd_search(args: argparse.Namespace) -> None:
    """Handle the `search` subcommand: query the index and print hits."""
    vault_path = _resolve(args.vault, "NOTESEARCH_VAULT", "vault", FALLBACK_VAULT)
    index_location = _resolve(args.index_dir, "NOTESEARCH_INDEX_DIR", "index_dir", "") or None
    ollama = _resolve(args.ollama_url, "NOTESEARCH_OLLAMA_URL", "ollama_url", FALLBACK_OLLAMA_URL)
    hits = search(args.query, vault_path, index_location, ollama, args.top_k)
    if not hits:
        print("No results found.")
        return
    # One blank-line-separated entry per hit: "[score] path" then the chunk text.
    for hit in hits:
        print(f"[{hit['score']:.2f}] {hit['file']}")
        print(hit["text"])
        print()
def main() -> None:
    """Parse CLI arguments and dispatch to the chosen subcommand.

    Exits with status 1 on FileNotFoundError/ValueError raised by the core
    layer (missing vault, missing index, empty vault, ...).
    """
    parser = argparse.ArgumentParser(
        prog="notesearch",
        description="Local vector search over markdown notes",
    )
    # Options shared by both subcommands.
    parser.add_argument("--vault", help="Path to the Obsidian vault")
    parser.add_argument("--index-dir", help="Path to store/load the index")
    parser.add_argument("--ollama-url", help="Ollama API URL")

    subparsers = parser.add_subparsers(dest="command", required=True)

    index_cmd = subparsers.add_parser("index", help="Build the search index")
    index_cmd.add_argument("--embedding-model", dest="model", help="Ollama embedding model name")

    search_cmd = subparsers.add_parser("search", help="Search the notes")
    search_cmd.add_argument("query", help="Search query")
    search_cmd.add_argument("--top-k", type=int, default=5, help="Number of results")

    args = parser.parse_args()
    # `required=True` above guarantees args.command is one of these keys.
    handlers = {"index": cmd_index, "search": cmd_search}
    try:
        handlers[args.command](args)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,124 @@
"""Core indexing and search logic."""
import json
from pathlib import Path
from llama_index.core import (
SimpleDirectoryReader,
StorageContext,
VectorStoreIndex,
load_index_from_storage,
)
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding
# Defaults used when a value is supplied neither via flag, env var, nor config.json.
FALLBACK_VAULT = "/home/lyx/Documents/obsidian-yanxin"
FALLBACK_EMBEDDING_MODEL = "qwen3-embedding:0.6b"
FALLBACK_OLLAMA_URL = "http://localhost:11434"
# Sidecar file written next to the persisted index; records which model built it.
METADATA_FILE = "notesearch_meta.json"
# config.json lives in the project root, one level above this package directory.
CONFIG_FILE = Path(__file__).parent.parent / "config.json"
def load_config() -> dict:
    """Parse config.json from the project root; a missing file yields {}."""
    if not CONFIG_FILE.exists():
        return {}
    return json.loads(CONFIG_FILE.read_text())
def get_config_value(key: str, fallback: str) -> str:
    """Look up *key* in config.json; return *fallback* when absent or falsy."""
    value = load_config().get(key)
    return value if value else fallback
def _get_index_dir(vault_path: str, index_dir: str | None) -> Path:
    """Pick the index location: explicit *index_dir* wins, else <vault>/.index."""
    return Path(index_dir) if index_dir else Path(vault_path) / ".index"
def _get_embed_model(ollama_url: str, model: str) -> OllamaEmbedding:
    """Build the Ollama embedding client (tests patch this function out)."""
    return OllamaEmbedding(model_name=model, base_url=ollama_url)
def build_index(
    vault_path: str = FALLBACK_VAULT,
    index_dir: str | None = None,
    ollama_url: str = FALLBACK_OLLAMA_URL,
    model: str = FALLBACK_EMBEDDING_MODEL,
) -> Path:
    """Build and persist a vector index from the vault's markdown files.

    Args:
        vault_path: Directory containing ``.md`` notes (searched recursively).
        index_dir: Where to persist the index; defaults to ``<vault>/.index``.
        ollama_url: Base URL of the Ollama server used for embeddings.
        model: Ollama embedding model name.

    Returns:
        Path to the directory the index was persisted to.

    Raises:
        FileNotFoundError: If *vault_path* is not a directory.
        ValueError: If the vault contains no markdown files.
    """
    vault = Path(vault_path)
    if not vault.is_dir():
        raise FileNotFoundError(f"Vault not found: {vault_path}")
    # Validate BEFORE creating the index directory so a failed build does not
    # leave an empty .index/ behind. (SimpleDirectoryReader raises its own
    # error on empty dirs, but we want a clearer message.)
    md_files = list(vault.rglob("*.md"))
    if not md_files:
        raise ValueError(f"No markdown files found in {vault_path}")
    idx_path = _get_index_dir(vault_path, index_dir)
    idx_path.mkdir(parents=True, exist_ok=True)
    documents = SimpleDirectoryReader(
        str(vault),
        recursive=True,
        required_exts=[".md"],
    ).load_data()
    # Split on markdown headings so each chunk is a coherent section.
    nodes = MarkdownNodeParser().get_nodes_from_documents(documents)
    embed_model = _get_embed_model(ollama_url, model)
    index = VectorStoreIndex(nodes, embed_model=embed_model)
    index.storage_context.persist(persist_dir=str(idx_path))
    # Record which model built the index so search() can embed queries with
    # the same model — mixing models would make similarity scores meaningless.
    meta = {"model": model, "ollama_url": ollama_url, "vault_path": vault_path}
    (idx_path / METADATA_FILE).write_text(json.dumps(meta, indent=2))
    return idx_path
def search(
    query: str,
    vault_path: str = FALLBACK_VAULT,
    index_dir: str | None = None,
    ollama_url: str = FALLBACK_OLLAMA_URL,
    top_k: int = 5,
) -> list[dict]:
    """Embed *query* and return the most similar note chunks.

    Each result is a dict with "score", "file", and "text" keys, at most
    *top_k* of them.

    Raises:
        FileNotFoundError: If no index has been built yet.
    """
    idx_path = _get_index_dir(vault_path, index_dir)
    if not idx_path.exists():
        raise FileNotFoundError(
            f"Index not found at {idx_path}. Run 'notesearch index' first."
        )
    # Prefer the model recorded at index time so query embeddings live in the
    # same vector space as the stored chunks.
    meta_file = idx_path / METADATA_FILE
    model = FALLBACK_EMBEDDING_MODEL
    if meta_file.exists():
        model = json.loads(meta_file.read_text()).get("model", FALLBACK_EMBEDDING_MODEL)
    embedder = _get_embed_model(ollama_url, model)
    storage = StorageContext.from_defaults(persist_dir=str(idx_path))
    index = load_index_from_storage(storage, embed_model=embedder)
    hits = index.as_retriever(similarity_top_k=top_k).retrieve(query)
    return [
        {
            "score": round(hit.score, 4),
            "file": hit.node.metadata.get("file_path", "unknown"),
            "text": hit.node.text,
        }
        for hit in hits
    ]

View File

@@ -0,0 +1,18 @@
[project]
name = "notesearch"
version = "0.1.0"
description = "Local vector search over markdown notes using LlamaIndex + Ollama"
requires-python = ">=3.11"
dependencies = [
"llama-index",
"llama-index-embeddings-ollama",
]
[project.scripts]
notesearch = "notesearch.cli:main"
[tool.pytest.ini_options]
testpaths = ["tests"]
[dependency-groups]
dev = ["pytest"]

View File

View File

@@ -0,0 +1,152 @@
"""Tests for notesearch core functionality."""
import hashlib
import json
from pathlib import Path
from typing import Any
from unittest.mock import patch
import pytest
from llama_index.core.base.embeddings.base import BaseEmbedding
from notesearch.core import FALLBACK_EMBEDDING_MODEL, METADATA_FILE, build_index, search
class FakeEmbedding(BaseEmbedding):
    """Deterministic stand-in embedding model — no Ollama server required."""

    model_name: str = "test-model"

    def _get_text_embedding(self, text: str) -> list[float]:
        # Derive a repeatable 768-dim vector from the md5 of the text:
        # 16 digest bytes scaled into [0, 1], tiled 48 times (16 * 48 = 768).
        digest = hashlib.md5(text.encode()).digest()
        return [byte / 255.0 for byte in digest] * 48

    def _get_query_embedding(self, query: str) -> list[float]:
        return self._get_text_embedding(query)

    async def _aget_text_embedding(self, text: str) -> list[float]:
        return self._get_text_embedding(text)

    async def _aget_query_embedding(self, query: str) -> list[float]:
        return self._get_text_embedding(query)
def _mock_embed_model(*args: Any, **kwargs: Any) -> FakeEmbedding:
    """Drop-in replacement for `notesearch.core._get_embed_model` in tests."""
    return FakeEmbedding()
@pytest.fixture
def sample_vault(tmp_path: Path) -> Path:
    """Build a throwaway vault containing a few realistic markdown notes."""
    vault = tmp_path / "vault"
    # Relative path -> note body; parent dirs are created on demand.
    notes = {
        "health/allergy.md": (
            "# Allergy Treatment\n\n"
            "Started allergy shots in March 2026.\n"
            "Weekly schedule: Tuesday and Thursday.\n"
            "Clinic is at 123 Main St.\n"
        ),
        "work/project-alpha.md": (
            "# Project Alpha\n\n"
            "## Goals\n"
            "Launch the new API by Q2.\n"
            "Migrate all users to v2 endpoints.\n\n"
            "## Status\n"
            "Backend is 80% done. Frontend blocked on design review.\n"
        ),
        "recipes.md": (
            "# Favorite Recipes\n\n"
            "## Pasta Carbonara\n"
            "Eggs, pecorino, guanciale, black pepper.\n"
            "Cook pasta al dente, mix off heat.\n"
        ),
    }
    for rel_path, body in notes.items():
        target = vault / rel_path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(body)
    return vault
@pytest.fixture
def empty_vault(tmp_path: Path) -> Path:
    """Provide a vault directory that contains no markdown files at all."""
    path = tmp_path / "empty_vault"
    path.mkdir()
    return path
class TestBuildIndex:
    """build_index: input validation, persistence, and metadata sidecar."""

    def test_missing_vault(self, tmp_path: Path) -> None:
        missing = tmp_path / "nonexistent"
        with pytest.raises(FileNotFoundError, match="Vault not found"):
            build_index(vault_path=str(missing))

    def test_empty_vault(self, empty_vault: Path) -> None:
        with pytest.raises(ValueError, match="No markdown files found"):
            build_index(vault_path=str(empty_vault))

    @patch("notesearch.core._get_embed_model", _mock_embed_model)
    def test_builds_index(self, sample_vault: Path, tmp_path: Path) -> None:
        target = tmp_path / "index"
        result = build_index(vault_path=str(sample_vault), index_dir=str(target))
        # The returned path is the directory we asked for, and it now holds
        # both the persisted index and the metadata sidecar.
        assert result == target
        assert result.exists()
        meta_path = result / METADATA_FILE
        assert meta_path.exists()
        meta = json.loads(meta_path.read_text())
        assert meta["vault_path"] == str(sample_vault)
        assert "model" in meta

    @patch("notesearch.core._get_embed_model", _mock_embed_model)
    def test_index_stores_model_metadata(self, sample_vault: Path, tmp_path: Path) -> None:
        target = tmp_path / "index"
        build_index(
            vault_path=str(sample_vault),
            index_dir=str(target),
            model="custom-model",
        )
        meta = json.loads((target / METADATA_FILE).read_text())
        assert meta["model"] == "custom-model"
class TestSearch:
    """search: error handling, result contract, and top_k behavior."""

    def test_missing_index(self, tmp_path: Path) -> None:
        with pytest.raises(FileNotFoundError, match="Index not found"):
            search("test query", vault_path=str(tmp_path))

    @patch("notesearch.core._get_embed_model", _mock_embed_model)
    def test_search_returns_results(self, sample_vault: Path, tmp_path: Path) -> None:
        target = tmp_path / "index"
        build_index(vault_path=str(sample_vault), index_dir=str(target))
        hits = search(
            "allergy shots",
            vault_path=str(sample_vault),
            index_dir=str(target),
            top_k=3,
        )
        assert len(hits) > 0
        # Every hit exposes the full result contract.
        for hit in hits:
            assert "score" in hit
            assert "file" in hit
            assert "text" in hit

    @patch("notesearch.core._get_embed_model", _mock_embed_model)
    def test_search_respects_top_k(self, sample_vault: Path, tmp_path: Path) -> None:
        target = tmp_path / "index"
        build_index(vault_path=str(sample_vault), index_dir=str(target))
        hits = search(
            "anything",
            vault_path=str(sample_vault),
            index_dir=str(target),
            top_k=1,
        )
        assert len(hits) == 1

2154
skills/notesearch/uv.lock generated Normal file

File diff suppressed because it is too large Load Diff