move note search
This commit is contained in:
76
skills/notesearch/README.md
Normal file
76
skills/notesearch/README.md
Normal file
@@ -0,0 +1,76 @@
|
||||
# notesearch
|
||||
|
||||
Local vector search over markdown notes using LlamaIndex + Ollama.
|
||||
|
||||
Point it at an Obsidian vault (or any folder of `.md` files), build a vector index, and search by meaning — not just keywords.
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
cd ~/.openclaw/workspace/skills/notesearch
|
||||
uv sync
|
||||
```
|
||||
|
||||
Requires Ollama running locally with an embedding model pulled:
|
||||
|
||||
```bash
|
||||
ollama pull qwen3-embedding:0.6b
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Build the index
|
||||
|
||||
```bash
|
||||
./notesearch.sh index --vault /path/to/vault
|
||||
```
|
||||
|
||||
### Search
|
||||
|
||||
```bash
|
||||
./notesearch.sh search "where do I get my allergy shots"
|
||||
```
|
||||
|
||||
Output:
|
||||
|
||||
```
|
||||
[0.87] Health/allergy.md
|
||||
Started allergy shots in March 2026. Clinic is at 123 Main St.
|
||||
|
||||
[0.72] Daily/2026-03-25.md
|
||||
Went to allergy appointment today.
|
||||
```
|
||||
|
||||
### Configuration
|
||||
|
||||
Edit `config.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"vault": "/home/lyx/Documents/obsidian-yanxin",
|
||||
"index_dir": null,
|
||||
"ollama_url": "http://localhost:11434",
|
||||
"embedding_model": "qwen3-embedding:0.6b"
|
||||
}
|
||||
```
|
||||
|
||||
Values can also be set via flags or env vars. Priority: **flag > env var > config.json > fallback**.
|
||||
|
||||
| Flag | Env var | Config key | Default |
|
||||
|------|---------|------------|---------|
|
||||
| `--vault` | `NOTESEARCH_VAULT` | `vault` | `/home/lyx/Documents/obsidian-yanxin` |
|
||||
| `--index-dir` | `NOTESEARCH_INDEX_DIR` | `index_dir` | `<vault>/.index/` |
|
||||
| `--ollama-url` | `NOTESEARCH_OLLAMA_URL` | `ollama_url` | `http://localhost:11434` |
|
||||
| `--embedding-model` | `NOTESEARCH_EMBEDDING_MODEL` | `embedding_model` | `qwen3-embedding:0.6b` |
|
||||
| `--top-k` | — | — | `5` |
|
||||
|
||||
## Tests
|
||||
|
||||
```bash
|
||||
uv run pytest
|
||||
```
|
||||
|
||||
## How it works
|
||||
|
||||
1. **Index**: reads all `.md` files, splits on markdown headings, embeds each chunk via Ollama, stores vectors locally
|
||||
2. **Search**: embeds your query, finds the most similar chunks, returns them with file paths and relevance scores
|
||||
4
skills/notesearch/_meta.json
Normal file
4
skills/notesearch/_meta.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"slug": "notesearch",
|
||||
"version": "0.1.0"
|
||||
}
|
||||
6
skills/notesearch/config.json
Normal file
6
skills/notesearch/config.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"vault": "/home/lyx/Documents/obsidian-yanxin",
|
||||
"index_dir": null,
|
||||
"ollama_url": "http://localhost:11434",
|
||||
"embedding_model": "qwen3-embedding:0.6b"
|
||||
}
|
||||
7
skills/notesearch/notesearch.sh
Executable file
7
skills/notesearch/notesearch.sh
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env bash
# Launcher for the notesearch CLI.
#
# Changes into the directory that holds this script and then hands control to
# `uv run python -m notesearch`, forwarding all arguments untouched.
set -euo pipefail

# Absolute path of the directory containing this script.
script_dir="$(cd "$(dirname "$0")" && pwd)"

# NOTE: after this cd, relative paths given as arguments are resolved against
# the script directory, not the caller's original working directory.
cd "$script_dir"

# exec replaces the shell, so the Python process's exit status is the script's.
exec uv run python -m notesearch "$@"
|
||||
0
skills/notesearch/notesearch/__init__.py
Normal file
0
skills/notesearch/notesearch/__init__.py
Normal file
5
skills/notesearch/notesearch/__main__.py
Normal file
5
skills/notesearch/notesearch/__main__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Allow running as `python -m notesearch`."""
|
||||
|
||||
from notesearch.cli import main
|
||||
|
||||
main()
|
||||
89
skills/notesearch/notesearch/cli.py
Normal file
89
skills/notesearch/notesearch/cli.py
Normal file
@@ -0,0 +1,89 @@
|
||||
"""CLI entry point for notesearch."""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
from notesearch.core import (
|
||||
FALLBACK_EMBEDDING_MODEL,
|
||||
FALLBACK_OLLAMA_URL,
|
||||
FALLBACK_VAULT,
|
||||
build_index,
|
||||
get_config_value,
|
||||
search,
|
||||
)
|
||||
|
||||
|
||||
def _resolve(flag_value: str | None, env_name: str, config_key: str, fallback: str) -> str:
|
||||
"""Resolve a value with priority: flag > env var > config.json > fallback."""
|
||||
if flag_value:
|
||||
return flag_value
|
||||
env = os.environ.get(env_name)
|
||||
if env:
|
||||
return env
|
||||
return get_config_value(config_key, fallback)
|
||||
|
||||
|
||||
def cmd_index(args: argparse.Namespace) -> None:
    """Run the ``index`` subcommand: resolve settings, build the index, report where it went."""
    vault_path = _resolve(args.vault, "NOTESEARCH_VAULT", "vault", FALLBACK_VAULT)
    # No hardcoded fallback for the index location: an empty result becomes
    # None, which is passed on to build_index unchanged.
    index_location = (
        _resolve(args.index_dir, "NOTESEARCH_INDEX_DIR", "index_dir", "") or None
    )
    base_url = _resolve(
        args.ollama_url, "NOTESEARCH_OLLAMA_URL", "ollama_url", FALLBACK_OLLAMA_URL
    )
    embedding_model = _resolve(
        args.model,
        "NOTESEARCH_EMBEDDING_MODEL",
        "embedding_model",
        FALLBACK_EMBEDDING_MODEL,
    )

    print(f"Indexing vault: {vault_path}")
    print(f"Model: {embedding_model}")
    saved_to = build_index(vault_path, index_location, base_url, embedding_model)
    print(f"Index saved to: {saved_to}")
|
||||
|
||||
|
||||
def cmd_search(args: argparse.Namespace) -> None:
    """Run the ``search`` subcommand and print each hit as score, file and text."""
    vault_path = _resolve(args.vault, "NOTESEARCH_VAULT", "vault", FALLBACK_VAULT)
    # An empty resolved index dir becomes None and is passed to search as-is.
    index_location = (
        _resolve(args.index_dir, "NOTESEARCH_INDEX_DIR", "index_dir", "") or None
    )
    base_url = _resolve(
        args.ollama_url, "NOTESEARCH_OLLAMA_URL", "ollama_url", FALLBACK_OLLAMA_URL
    )

    hits = search(args.query, vault_path, index_location, base_url, args.top_k)
    if not hits:
        print("No results found.")
        return

    for hit in hits:
        # Header line, chunk text, then one blank separator line.
        print(f"[{hit['score']:.2f}] {hit['file']}")
        print(hit["text"])
        print()
|
||||
|
||||
|
||||
def _add_common_options(
    target: argparse.ArgumentParser, *, after_subcommand: bool = False
) -> None:
    """Register the options shared by every subcommand on ``target``."""
    # On a subparser an omitted option must not create an attribute at all;
    # otherwise its None default would overwrite a value that was already
    # parsed in front of the subcommand.
    extra = {"default": argparse.SUPPRESS} if after_subcommand else {}
    target.add_argument("--vault", help="Path to the Obsidian vault", **extra)
    target.add_argument("--index-dir", help="Path to store/load the index", **extra)
    target.add_argument("--ollama-url", help="Ollama API URL", **extra)


def main(argv: list[str] | None = None) -> None:
    """Parse the command line and dispatch to the chosen subcommand.

    The shared options (``--vault``, ``--index-dir``, ``--ollama-url``) are
    accepted both before and after the subcommand, so
    ``notesearch --vault X index`` and ``notesearch index --vault X`` are
    equivalent. If an option is given in both places, the one after the
    subcommand wins.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Exits with status 1 and an ``Error: ...`` message on stderr when a command
    raises ``FileNotFoundError`` or ``ValueError``.
    """
    parser = argparse.ArgumentParser(
        prog="notesearch",
        description="Local vector search over markdown notes",
    )
    _add_common_options(parser)

    subparsers = parser.add_subparsers(dest="command", required=True)

    # index
    idx_parser = subparsers.add_parser("index", help="Build the search index")
    _add_common_options(idx_parser, after_subcommand=True)
    idx_parser.add_argument(
        "--embedding-model", dest="model", help="Ollama embedding model name"
    )

    # search
    search_parser = subparsers.add_parser("search", help="Search the notes")
    _add_common_options(search_parser, after_subcommand=True)
    search_parser.add_argument("query", help="Search query")
    search_parser.add_argument("--top-k", type=int, default=5, help="Number of results")

    args = parser.parse_args(argv)

    try:
        if args.command == "index":
            cmd_index(args)
        elif args.command == "search":
            cmd_search(args)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
124
skills/notesearch/notesearch/core.py
Normal file
124
skills/notesearch/notesearch/core.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""Core indexing and search logic."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from llama_index.core import (
|
||||
SimpleDirectoryReader,
|
||||
StorageContext,
|
||||
VectorStoreIndex,
|
||||
load_index_from_storage,
|
||||
)
|
||||
from llama_index.core.node_parser import MarkdownNodeParser
|
||||
from llama_index.embeddings.ollama import OllamaEmbedding
|
||||
|
||||
|
||||
FALLBACK_VAULT = "/home/lyx/Documents/obsidian-yanxin"
|
||||
FALLBACK_EMBEDDING_MODEL = "qwen3-embedding:0.6b"
|
||||
FALLBACK_OLLAMA_URL = "http://localhost:11434"
|
||||
METADATA_FILE = "notesearch_meta.json"
|
||||
CONFIG_FILE = Path(__file__).parent.parent / "config.json"
|
||||
|
||||
|
||||
def load_config() -> dict:
    """Load settings from ``CONFIG_FILE``.

    Returns:
        The parsed JSON object, or an empty dict when the file does not exist.

    Raises:
        ValueError: If the file is not valid UTF-8 JSON (``json.JSONDecodeError``
            and ``UnicodeDecodeError`` are both ``ValueError`` subclasses) or
            if its top-level value is not a JSON object.
    """
    if not CONFIG_FILE.exists():
        return {}

    # Decode explicitly as UTF-8 so non-ASCII paths in the config do not
    # depend on the platform's locale encoding.
    config = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))

    # Callers use dict lookups on the result; reject other shapes up front
    # with an error type the CLI already reports cleanly.
    if not isinstance(config, dict):
        raise ValueError(
            f"{CONFIG_FILE} must contain a JSON object, got {type(config).__name__}"
        )
    return config
|
||||
|
||||
|
||||
def get_config_value(key: str, fallback: str) -> str:
    """Look up ``key`` in config.json, using ``fallback`` when it is missing or falsy."""
    # A null or empty entry in the config is treated the same as an absent one.
    configured = load_config().get(key)
    return configured if configured else fallback
|
||||
|
||||
|
||||
def _get_index_dir(vault_path: str, index_dir: str | None) -> Path:
|
||||
if index_dir:
|
||||
return Path(index_dir)
|
||||
return Path(vault_path) / ".index"
|
||||
|
||||
|
||||
def _get_embed_model(ollama_url: str, model: str) -> OllamaEmbedding:
    """Create the Ollama embedding client for ``model`` served at ``ollama_url``."""
    client = OllamaEmbedding(base_url=ollama_url, model_name=model)
    return client
|
||||
|
||||
|
||||
def build_index(
    vault_path: str = FALLBACK_VAULT,
    index_dir: str | None = None,
    ollama_url: str = FALLBACK_OLLAMA_URL,
    model: str = FALLBACK_EMBEDDING_MODEL,
) -> Path:
    """Build a vector index from the markdown files in a vault.

    Args:
        vault_path: Directory that is searched recursively for ``.md`` files.
        index_dir: Where to persist the index. ``None`` or an empty string
            selects ``<vault_path>/.index``.
        ollama_url: Base URL of the Ollama server used for embeddings.
        model: Name of the Ollama embedding model.

    Returns:
        The directory the index and its metadata file were written to.

    Raises:
        FileNotFoundError: If ``vault_path`` is not an existing directory.
        ValueError: If the vault contains no ``.md`` files.
    """
    vault = Path(vault_path)
    if not vault.is_dir():
        raise FileNotFoundError(f"Vault not found: {vault_path}")

    # Check for markdown files before loading (SimpleDirectoryReader raises
    # its own error on empty dirs, but we want a clearer message). Only
    # existence matters, so stop at the first match instead of listing all.
    if next(vault.rglob("*.md"), None) is None:
        raise ValueError(f"No markdown files found in {vault_path}")

    documents = SimpleDirectoryReader(
        str(vault),
        recursive=True,
        required_exts=[".md"],
    ).load_data()

    embed_model = _get_embed_model(ollama_url, model)
    parser = MarkdownNodeParser()
    nodes = parser.get_nodes_from_documents(documents)

    index = VectorStoreIndex(nodes, embed_model=embed_model)

    # Create the directory only once there is an index to write. Creating it
    # earlier would leave an empty directory behind after a failed run, and
    # search() treats an existing directory as "an index is present".
    idx_path = _get_index_dir(vault_path, index_dir)
    idx_path.mkdir(parents=True, exist_ok=True)
    index.storage_context.persist(persist_dir=str(idx_path))

    # Save metadata so search() can embed queries with the same model.
    meta = {"model": model, "ollama_url": ollama_url, "vault_path": vault_path}
    (idx_path / METADATA_FILE).write_text(
        json.dumps(meta, indent=2), encoding="utf-8"
    )

    return idx_path
|
||||
|
||||
|
||||
def search(
    query: str,
    vault_path: str = FALLBACK_VAULT,
    index_dir: str | None = None,
    ollama_url: str = FALLBACK_OLLAMA_URL,
    top_k: int = 5,
) -> list[dict]:
    """Query a previously built index and return the best-matching chunks.

    Args:
        query: Free-text search query.
        vault_path: Vault whose default index location is used when
            ``index_dir`` is not given.
        index_dir: Explicit index directory, or ``None`` for ``<vault>/.index``.
        ollama_url: Base URL of the Ollama server used to embed the query.
        top_k: Number of results requested from the retriever.

    Returns:
        One dict per hit with keys ``score`` (rounded to 4 decimals), ``file``
        (the node's ``file_path`` metadata, or ``"unknown"``) and ``text``.

    Raises:
        FileNotFoundError: If the index directory does not exist.
    """
    idx_path = _get_index_dir(vault_path, index_dir)
    if not idx_path.exists():
        raise FileNotFoundError(
            f"Index not found at {idx_path}. Run 'notesearch index' first."
        )

    # Use the model recorded when the index was built; fall back to the
    # default model if the metadata file or its "model" entry is absent.
    model = FALLBACK_EMBEDDING_MODEL
    meta_file = idx_path / METADATA_FILE
    if meta_file.exists():
        recorded = json.loads(meta_file.read_text())
        model = recorded.get("model", FALLBACK_EMBEDDING_MODEL)

    embed_model = _get_embed_model(ollama_url, model)

    storage_context = StorageContext.from_defaults(persist_dir=str(idx_path))
    index = load_index_from_storage(storage_context, embed_model=embed_model)

    hits = index.as_retriever(similarity_top_k=top_k).retrieve(query)

    matches = []
    for hit in hits:
        matches.append(
            {
                "score": round(hit.score, 4),
                "file": hit.node.metadata.get("file_path", "unknown"),
                "text": hit.node.text,
            }
        )
    return matches
|
||||
18
skills/notesearch/pyproject.toml
Normal file
18
skills/notesearch/pyproject.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[project]
|
||||
name = "notesearch"
|
||||
version = "0.1.0"
|
||||
description = "Local vector search over markdown notes using LlamaIndex + Ollama"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"llama-index",
|
||||
"llama-index-embeddings-ollama",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
notesearch = "notesearch.cli:main"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
|
||||
[dependency-groups]
|
||||
dev = ["pytest"]
|
||||
0
skills/notesearch/tests/__init__.py
Normal file
0
skills/notesearch/tests/__init__.py
Normal file
152
skills/notesearch/tests/test_core.py
Normal file
152
skills/notesearch/tests/test_core.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""Tests for notesearch core functionality."""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from llama_index.core.base.embeddings.base import BaseEmbedding
|
||||
from notesearch.core import FALLBACK_EMBEDDING_MODEL, METADATA_FILE, build_index, search
|
||||
|
||||
|
||||
class FakeEmbedding(BaseEmbedding):
    """Offline embedding stub: the vector is derived from an MD5 hash of the text."""

    model_name: str = "test-model"

    def _get_text_embedding(self, text: str) -> list[float]:
        # 16 digest bytes scaled by 1/255, repeated 48 times -> 768 values.
        digest = hashlib.md5(text.encode()).digest()
        scaled = [byte / 255.0 for byte in digest]
        return scaled * 48

    def _get_query_embedding(self, query: str) -> list[float]:
        # Queries are embedded exactly like document text.
        return self._get_text_embedding(query)

    async def _aget_text_embedding(self, text: str) -> list[float]:
        # Async variant delegates to the synchronous implementation.
        return self._get_text_embedding(text)

    async def _aget_query_embedding(self, query: str) -> list[float]:
        # Async variant delegates to the synchronous implementation.
        return self._get_text_embedding(query)
|
||||
|
||||
|
||||
def _mock_embed_model(*args: Any, **kwargs: Any) -> FakeEmbedding:
    """Patch target for ``_get_embed_model``: ignore all arguments, return a fake model."""
    fake = FakeEmbedding()
    return fake
|
||||
|
||||
|
||||
@pytest.fixture
def sample_vault(tmp_path: Path) -> Path:
    """Build a throwaway vault containing three small markdown notes."""
    vault = tmp_path / "vault"
    vault.mkdir()

    # Path parts relative to the vault root -> file content.
    notes = {
        ("health", "allergy.md"): (
            "# Allergy Treatment\n\n"
            "Started allergy shots in March 2026.\n"
            "Weekly schedule: Tuesday and Thursday.\n"
            "Clinic is at 123 Main St.\n"
        ),
        ("work", "project-alpha.md"): (
            "# Project Alpha\n\n"
            "## Goals\n"
            "Launch the new API by Q2.\n"
            "Migrate all users to v2 endpoints.\n\n"
            "## Status\n"
            "Backend is 80% done. Frontend blocked on design review.\n"
        ),
        ("recipes.md",): (
            "# Favorite Recipes\n\n"
            "## Pasta Carbonara\n"
            "Eggs, pecorino, guanciale, black pepper.\n"
            "Cook pasta al dente, mix off heat.\n"
        ),
    }

    for parts, body in notes.items():
        note_path = vault.joinpath(*parts)
        # The top-level note's parent is the vault itself, which already exists.
        note_path.parent.mkdir(parents=True, exist_ok=True)
        note_path.write_text(body)

    return vault
|
||||
|
||||
|
||||
@pytest.fixture
def empty_vault(tmp_path: Path) -> Path:
    """Provide a vault directory that exists but holds no files."""
    empty_dir = tmp_path / "empty_vault"
    empty_dir.mkdir()
    return empty_dir
|
||||
|
||||
|
||||
class TestBuildIndex:
    """Tests for build_index: input validation and the files it writes."""

    def test_missing_vault(self, tmp_path: Path) -> None:
        # A path that was never created must be reported as a missing vault.
        absent = tmp_path / "nonexistent"
        with pytest.raises(FileNotFoundError, match="Vault not found"):
            build_index(vault_path=str(absent))

    def test_empty_vault(self, empty_vault: Path) -> None:
        # An existing directory without markdown files is rejected.
        with pytest.raises(ValueError, match="No markdown files found"):
            build_index(vault_path=str(empty_vault))

    @patch("notesearch.core._get_embed_model", _mock_embed_model)
    def test_builds_index(self, sample_vault: Path, tmp_path: Path) -> None:
        target_dir = tmp_path / "index"

        returned_path = build_index(
            vault_path=str(sample_vault),
            index_dir=str(target_dir),
        )

        assert returned_path == target_dir
        assert returned_path.exists()

        meta_path = returned_path / METADATA_FILE
        assert meta_path.exists()

        recorded = json.loads(meta_path.read_text())
        assert recorded["vault_path"] == str(sample_vault)
        assert "model" in recorded

    @patch("notesearch.core._get_embed_model", _mock_embed_model)
    def test_index_stores_model_metadata(self, sample_vault: Path, tmp_path: Path) -> None:
        target_dir = tmp_path / "index"

        build_index(
            vault_path=str(sample_vault),
            index_dir=str(target_dir),
            model="custom-model",
        )

        # The model name passed in must be the one written to the metadata file.
        recorded = json.loads((target_dir / METADATA_FILE).read_text())
        assert recorded["model"] == "custom-model"
|
||||
|
||||
|
||||
class TestSearch:
    """Tests for search: missing-index handling, result shape and top_k."""

    def test_missing_index(self, tmp_path: Path) -> None:
        # tmp_path has no .index directory, so the default location is absent.
        with pytest.raises(FileNotFoundError, match="Index not found"):
            search("test query", vault_path=str(tmp_path))

    @patch("notesearch.core._get_embed_model", _mock_embed_model)
    def test_search_returns_results(self, sample_vault: Path, tmp_path: Path) -> None:
        target_dir = tmp_path / "index"
        build_index(vault_path=str(sample_vault), index_dir=str(target_dir))

        hits = search(
            "allergy shots",
            vault_path=str(sample_vault),
            index_dir=str(target_dir),
            top_k=3,
        )

        assert len(hits) > 0
        # Every hit must expose the three documented keys.
        assert all("score" in hit for hit in hits)
        assert all("file" in hit for hit in hits)
        assert all("text" in hit for hit in hits)

    @patch("notesearch.core._get_embed_model", _mock_embed_model)
    def test_search_respects_top_k(self, sample_vault: Path, tmp_path: Path) -> None:
        target_dir = tmp_path / "index"
        build_index(vault_path=str(sample_vault), index_dir=str(target_dir))

        hits = search(
            "anything",
            vault_path=str(sample_vault),
            index_dir=str(target_dir),
            top_k=1,
        )

        assert len(hits) == 1
|
||||
2154
skills/notesearch/uv.lock
generated
Normal file
2154
skills/notesearch/uv.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user