from __future__ import annotations from collections.abc import Iterator import pytest from fastapi.testclient import TestClient from stirling.api import app from stirling.api.dependencies import get_rag_service from stirling.models import FileId from stirling.rag import Document, RagService, SqliteVecStore class StubEmbedder: """Deterministic embeddings for route tests: no network, no provider needed.""" def __init__(self, dim: int = 8) -> None: self._dim = dim async def embed_query(self, text: str) -> list[float]: h = hash(text) % 1000 return [(h + i) / 1000.0 for i in range(self._dim)] async def embed_documents(self, texts: list[str]) -> list[list[float]]: return [await self.embed_query(t) for t in texts] def chunk_and_prepare( self, text: str, source: str = "", base_metadata: dict[str, str] | None = None, ) -> list[Document]: from stirling.rag.chunker import chunk_text chunks = chunk_text(text, 100, 10) docs = [] for i, chunk in enumerate(chunks): meta = dict(base_metadata) if base_metadata else {} meta["source"] = source meta["chunk_index"] = str(i) doc_id = f"{source}:chunk:{i}" if source else f"chunk:{i}" docs.append(Document(id=doc_id, text=chunk, metadata=meta)) return docs def _build_service() -> RagService: return RagService( embedder=StubEmbedder(), # type: ignore[arg-type] store=SqliteVecStore.ephemeral(), default_top_k=3, ) @pytest.fixture def service() -> RagService: return _build_service() @pytest.fixture def client(service: RagService) -> Iterator[TestClient]: app.dependency_overrides[get_rag_service] = lambda: service try: yield TestClient(app) finally: app.dependency_overrides.pop(get_rag_service, None) # ── POST /documents ───────────────────────────────────────────────────── def test_ingest_document_indexes_page_text(client: TestClient, service: RagService) -> None: response = client.post( "/api/v1/rag/documents", json={ "documentId": "doc-123", "source": "report.pdf", "pageText": [ {"pageNumber": 1, "text": "The introduction covers the main topic."}, {"pageNumber": 2, "text": "The conclusion summarises the findings."}, ], }, ) assert response.status_code == 200 body = response.json() assert body["documentId"] == "doc-123" assert body["chunksIndexed"] >= 2 @pytest.mark.anyio async def test_ingest_document_replaces_existing_content(client: TestClient, service: RagService) -> None: client.post( "/api/v1/rag/documents", json={ "documentId": "replace-me", "source": "replace-me.pdf", "pageText": [{"pageNumber": 1, "text": "Original content that existed before."}], }, ) # Second ingest with different content should replace the first entirely response = client.post( "/api/v1/rag/documents", json={ "documentId": "replace-me", "source": "replace-me.pdf", "pageText": [{"pageNumber": 1, "text": "New content that replaced the old."}], }, ) assert response.status_code == 200 results = await service.search("New content", collection=FileId("replace-me"), top_k=5) texts = [r.document.text for r in results] assert any("New content" in t for t in texts) assert not any("Original content" in t for t in texts) def test_ingest_document_skips_empty_pages(client: TestClient) -> None: response = client.post( "/api/v1/rag/documents", json={ "documentId": "mixed", "source": "mixed.pdf", "pageText": [ {"pageNumber": 1, "text": " "}, {"pageNumber": 2, "text": "Real content on page 2."}, ], }, ) assert response.status_code == 200 assert response.json()["chunksIndexed"] >= 1 def test_ingest_document_with_no_content_returns_zero(client: TestClient) -> None: response = client.post("/api/v1/rag/documents", json={"documentId": "empty", "source": "empty.pdf"}) assert response.status_code == 200 assert response.json()["chunksIndexed"] == 0 def test_ingest_document_rejects_empty_id(client: TestClient) -> None: response = client.post( "/api/v1/rag/documents", json={"documentId": "", "source": "x.pdf", "pageText": [{"pageNumber": 1, "text": "something"}]}, ) assert response.status_code == 422 def test_ingest_document_rejects_missing_source(client: TestClient) -> None: response = client.post( "/api/v1/rag/documents", json={"documentId": "doc-1", "pageText": [{"pageNumber": 1, "text": "something"}]}, ) assert response.status_code == 422 def test_ingest_document_rejects_empty_source(client: TestClient) -> None: response = client.post( "/api/v1/rag/documents", json={"documentId": "doc-1", "source": "", "pageText": [{"pageNumber": 1, "text": "something"}]}, ) assert response.status_code == 422 def test_ingest_document_rejects_non_positive_page_number(client: TestClient) -> None: response = client.post( "/api/v1/rag/documents", json={ "documentId": "bad-page", "source": "bad-page.pdf", "pageText": [{"pageNumber": 0, "text": "something"}], }, ) assert response.status_code == 422 # ── DELETE /documents/{id} ────────────────────────────────────────────── def test_delete_document_reports_deleted_true_when_existed(client: TestClient) -> None: client.post( "/api/v1/rag/documents", json={ "documentId": "to-delete", "source": "to-delete.pdf", "pageText": [{"pageNumber": 1, "text": "Text."}], }, ) response = client.delete("/api/v1/rag/documents/to-delete") assert response.status_code == 200 assert response.json() == {"documentId": "to-delete", "deleted": True} def test_delete_document_is_idempotent(client: TestClient) -> None: response = client.delete("/api/v1/rag/documents/never-existed") assert response.status_code == 200 assert response.json() == {"documentId": "never-existed", "deleted": False} @pytest.mark.anyio async def test_delete_document_removes_collection(client: TestClient, service: RagService) -> None: client.post( "/api/v1/rag/documents", json={"documentId": "gone", "source": "gone.pdf", "pageText": [{"pageNumber": 1, "text": "Text."}]}, ) assert await service.has_collection(FileId("gone")) client.delete("/api/v1/rag/documents/gone") assert not await service.has_collection(FileId("gone"))