# youlu-openclaw-workspace/skills/notesearch/notesearch/core.py
"""Core indexing and search logic."""
import json
from pathlib import Path
from llama_index.core import (
SimpleDirectoryReader,
StorageContext,
VectorStoreIndex,
load_index_from_storage,
)
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding
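# Third-party deps: llama-index-core and llama-index-embeddings-ollama;
# embeddings are computed by a local Ollama server.
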
FALLBACK_VAULT = "/home/lyx/Documents/obsidian-yanxin"
FALLBACK_EMBEDDING_MODEL = "qwen3-embedding:0.6b"
FALLBACK_OLLAMA_URL = "http://localhost:11434"
METADATA_FILE = "notesearch_meta.json"
CONFIG_FILE = Path(__file__).parent.parent / "config.json"
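
# config.json lives one level above this package (the skill root) and
# overrides the FALLBACK_* constants. The exact keys are defined by the
# callers of get_config_value(); the shape below is an assumption for
# illustration, not a spec:
#
#   {
#     "vault_path": "/path/to/vault",
#     "embedding_model": "qwen3-embedding:0.6b",
#     "ollama_url": "http://localhost:11434"
#   }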


def load_config() -> dict:
    """Load config from config.json, returning an empty dict if it is missing."""
    if CONFIG_FILE.exists():
        return json.loads(CONFIG_FILE.read_text())
    return {}


def get_config_value(key: str, fallback: str) -> str:
    """Get a config value from config.json, with a hardcoded fallback."""
    config = load_config()
    # `or` (rather than a .get default) so empty-string values also fall back.
    return config.get(key) or fallback


def _get_index_dir(vault_path: str, index_dir: str | None) -> Path:
    if index_dir:
        return Path(index_dir)
    return Path(vault_path) / ".index"


def _get_embed_model(ollama_url: str, model: str) -> OllamaEmbedding:
    return OllamaEmbedding(model_name=model, base_url=ollama_url)


def build_index(
    vault_path: str = FALLBACK_VAULT,
    index_dir: str | None = None,
    ollama_url: str = FALLBACK_OLLAMA_URL,
    model: str = FALLBACK_EMBEDDING_MODEL,
) -> Path:
    """Build a vector index from the markdown files in the vault."""
    vault = Path(vault_path)
    if not vault.is_dir():
        raise FileNotFoundError(f"Vault not found: {vault_path}")
    # Check for markdown files before touching the index dir: SimpleDirectoryReader
    # raises its own error on empty dirs, but we want a clearer message, and a
    # leftover empty index dir would fool the existence check in search().
    md_files = list(vault.rglob("*.md"))
    if not md_files:
        raise ValueError(f"No markdown files found in {vault_path}")
    idx_path = _get_index_dir(vault_path, index_dir)
    idx_path.mkdir(parents=True, exist_ok=True)
    documents = SimpleDirectoryReader(
        str(vault),
        recursive=True,
        required_exts=[".md"],
    ).load_data()
    embed_model = _get_embed_model(ollama_url, model)
    # Split on markdown structure (headings) rather than fixed-size chunks.
    parser = MarkdownNodeParser()
    nodes = parser.get_nodes_from_documents(documents)
    index = VectorStoreIndex(nodes, embed_model=embed_model)
    index.storage_context.persist(persist_dir=str(idx_path))
    # Record how the index was built so search() can embed queries with the
    # same model.
    meta = {"model": model, "ollama_url": ollama_url, "vault_path": vault_path}
    (idx_path / METADATA_FILE).write_text(json.dumps(meta, indent=2))
    return idx_path


def search(
    query: str,
    vault_path: str = FALLBACK_VAULT,
    index_dir: str | None = None,
    ollama_url: str = FALLBACK_OLLAMA_URL,
    top_k: int = 5,
) -> list[dict]:
    """Search the index and return the top matching chunks."""
    idx_path = _get_index_dir(vault_path, index_dir)
    if not idx_path.exists():
        raise FileNotFoundError(
            f"Index not found at {idx_path}. Run 'notesearch index' first."
        )
    # Embed the query with the same model the index was built with; fall back
    # to the default only if the metadata file is missing.
    meta_file = idx_path / METADATA_FILE
    if meta_file.exists():
        meta = json.loads(meta_file.read_text())
        model = meta.get("model", FALLBACK_EMBEDDING_MODEL)
    else:
        model = FALLBACK_EMBEDDING_MODEL
    embed_model = _get_embed_model(ollama_url, model)
    storage_context = StorageContext.from_defaults(persist_dir=str(idx_path))
    index = load_index_from_storage(storage_context, embed_model=embed_model)
    retriever = index.as_retriever(similarity_top_k=top_k)
    results = retriever.retrieve(query)
    return [
        {
            # Retrievers can return a None score, so guard before rounding.
            "score": round(r.score, 4) if r.score is not None else None,
            "file": r.node.metadata.get("file_path", "unknown"),
            "text": r.node.text,
        }
        for r in results
    ]
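

if __name__ == "__main__":
    # Minimal smoke test, not part of the skill's CLI: assumes an Ollama
    # server is reachable at the fallback URL with the fallback embedding
    # model already pulled, and that FALLBACK_VAULT exists. The query string
    # is illustrative.
    built = build_index()
    print(f"Index written to {built}")
    for hit in search("quarterly planning notes", top_k=3):
        print(f"{hit['score']}\t{hit['file']}")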