125 lines
3.8 KiB
Python
125 lines
3.8 KiB
Python
"""Core indexing and search logic."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from llama_index.core import (
|
|
SimpleDirectoryReader,
|
|
StorageContext,
|
|
VectorStoreIndex,
|
|
load_index_from_storage,
|
|
)
|
|
from llama_index.core.node_parser import MarkdownNodeParser
|
|
from llama_index.embeddings.ollama import OllamaEmbedding
|
|
|
|
|
|
FALLBACK_VAULT = "/home/lyx/Documents/obsidian-yanxin"
|
|
FALLBACK_EMBEDDING_MODEL = "qwen3-embedding:0.6b"
|
|
FALLBACK_OLLAMA_URL = "http://localhost:11434"
|
|
METADATA_FILE = "notesearch_meta.json"
|
|
CONFIG_FILE = Path(__file__).parent.parent / "config.json"
|
|
|
|
|
|
def load_config() -> dict:
    """Load config from config.json. Returns empty dict if not found.

    Uses EAFP (try/except) rather than an ``exists()`` pre-check so a
    config file removed between check and read cannot crash the call
    (avoids the TOCTOU race). Reads as UTF-8 explicitly so behavior does
    not depend on the platform's default encoding.

    Returns:
        The parsed JSON object, or ``{}`` when config.json is absent.
    """
    try:
        return json.loads(CONFIG_FILE.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {}
|
|
|
|
|
|
def get_config_value(key: str, fallback: str) -> str:
    """Look up *key* in config.json, with a hardcoded fallback.

    Any falsy stored value (missing key, empty string, JSON null) also
    resolves to *fallback*.
    """
    value = load_config().get(key)
    return value if value else fallback
|
|
|
|
|
|
def _get_index_dir(vault_path: str, index_dir: str | None) -> Path:
|
|
if index_dir:
|
|
return Path(index_dir)
|
|
return Path(vault_path) / ".index"
|
|
|
|
|
|
def _get_embed_model(ollama_url: str, model: str) -> OllamaEmbedding:
    """Construct an Ollama embedding client for the given server and model."""
    return OllamaEmbedding(base_url=ollama_url, model_name=model)
|
|
|
|
|
|
def build_index(
    vault_path: str = FALLBACK_VAULT,
    index_dir: str | None = None,
    ollama_url: str = FALLBACK_OLLAMA_URL,
    model: str = FALLBACK_EMBEDDING_MODEL,
) -> Path:
    """Build a vector index from markdown files in the vault.

    Args:
        vault_path: Root directory containing markdown notes.
        index_dir: Where to persist the index; defaults to ``<vault>/.index``.
        ollama_url: Base URL of the Ollama server used for embeddings.
        model: Name of the Ollama embedding model.

    Returns:
        Path of the directory the index was persisted into.

    Raises:
        FileNotFoundError: If *vault_path* is not an existing directory.
        ValueError: If the vault contains no ``*.md`` files.
    """
    vault = Path(vault_path)
    if not vault.is_dir():
        raise FileNotFoundError(f"Vault not found: {vault_path}")

    # Check for markdown files before loading (SimpleDirectoryReader raises
    # its own error on empty dirs, but we want a clearer message). Validate
    # *before* creating the index dir so a failed run does not leave an
    # empty .index directory behind.
    md_files = list(vault.rglob("*.md"))
    if not md_files:
        raise ValueError(f"No markdown files found in {vault_path}")

    idx_path = _get_index_dir(vault_path, index_dir)
    idx_path.mkdir(parents=True, exist_ok=True)

    documents = SimpleDirectoryReader(
        str(vault),
        recursive=True,
        required_exts=[".md"],
    ).load_data()

    embed_model = _get_embed_model(ollama_url, model)
    # Markdown-aware splitting keeps heading structure intact in chunks.
    parser = MarkdownNodeParser()
    nodes = parser.get_nodes_from_documents(documents)

    index = VectorStoreIndex(nodes, embed_model=embed_model)
    index.storage_context.persist(persist_dir=str(idx_path))

    # Save metadata so queries can detect which model built this index.
    meta = {"model": model, "ollama_url": ollama_url, "vault_path": vault_path}
    (idx_path / METADATA_FILE).write_text(
        json.dumps(meta, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    return idx_path
|
|
|
|
|
|
def search(
    query: str,
    vault_path: str = FALLBACK_VAULT,
    index_dir: str | None = None,
    ollama_url: str = FALLBACK_OLLAMA_URL,
    top_k: int = 5,
) -> list[dict]:
    """Search the index and return the top matching chunks.

    Each result is a dict with ``score`` (rounded to 4 places), ``file``
    (source path, ``"unknown"`` when absent from node metadata) and
    ``text`` (the chunk content).

    Raises:
        FileNotFoundError: If no index has been built at the target dir.
    """
    idx_path = _get_index_dir(vault_path, index_dir)
    if not idx_path.exists():
        raise FileNotFoundError(
            f"Index not found at {idx_path}. Run 'notesearch index' first."
        )

    # Queries must use the same embedding model that built the index;
    # recover the model name from the saved metadata when available.
    model = FALLBACK_EMBEDDING_MODEL
    meta_file = idx_path / METADATA_FILE
    if meta_file.exists():
        meta = json.loads(meta_file.read_text())
        model = meta.get("model", FALLBACK_EMBEDDING_MODEL)

    embed_model = _get_embed_model(ollama_url, model)
    storage_context = StorageContext.from_defaults(persist_dir=str(idx_path))
    index = load_index_from_storage(storage_context, embed_model=embed_model)

    retriever = index.as_retriever(similarity_top_k=top_k)

    hits: list[dict] = []
    for hit in retriever.retrieve(query):
        hits.append(
            {
                "score": round(hit.score, 4),
                "file": hit.node.metadata.get("file_path", "unknown"),
                "text": hit.node.text,
            }
        )
    return hits
|