From e5212f7f635f67d40ae88d82e4dffb2a77985338 Mon Sep 17 00:00:00 2001 From: DarioGii Date: Sat, 15 Nov 2025 20:07:31 +0000 Subject: [PATCH] pre-process docs on upload --- .../controller/ChatbotController.java | 60 ++-- .../chatbot/ChatbotSessionRegistry.java | 16 +- .../public/locales/en-GB/translation.json | 11 +- .../core/components/chatbot/ChatbotDrawer.tsx | 306 ++++++++---------- frontend/src/core/contexts/ChatbotContext.tsx | 224 ++++++++++++- .../src/core/services/docxTextExtractor.ts | 34 ++ 6 files changed, 443 insertions(+), 208 deletions(-) create mode 100644 frontend/src/core/services/docxTextExtractor.ts diff --git a/app/proprietary/src/main/java/stirling/software/proprietary/controller/ChatbotController.java b/app/proprietary/src/main/java/stirling/software/proprietary/controller/ChatbotController.java index 9dbf49c6f..c827a7745 100644 --- a/app/proprietary/src/main/java/stirling/software/proprietary/controller/ChatbotController.java +++ b/app/proprietary/src/main/java/stirling/software/proprietary/controller/ChatbotController.java @@ -45,21 +45,7 @@ public class ChatbotController { @RequestBody ChatbotSessionCreateRequest request) { ChatbotSession session = chatbotService.createSession(request); ChatbotSettings settings = featureProperties.current(); - ChatbotSessionResponse response = - ChatbotSessionResponse.builder() - .sessionId(session.getSessionId()) - .documentId(session.getDocumentId()) - .alphaWarning(settings.alphaWarning()) - .ocrRequested(session.isOcrRequested()) - .imageContentDetected(session.isImageContentDetected()) - .textCharacters(session.getTextCharacters()) - .estimatedTokens(session.getEstimatedTokens()) - .maxCachedCharacters(cacheService.getMaxDocumentCharacters()) - .createdAt(session.getCreatedAt()) - .warnings(sessionWarnings(settings, session)) - .metadata(session.getMetadata()) - .usageSummary(session.getUsageSummary()) - .build(); + ChatbotSessionResponse response = toResponse(session, settings); return 
ResponseEntity.status(HttpStatus.CREATED).body(response); } @@ -76,24 +62,21 @@ public class ChatbotController { sessionRegistry .findById(sessionId) .orElseThrow(() -> new ChatbotException("Session not found")); - ChatbotSessionResponse response = - ChatbotSessionResponse.builder() - .sessionId(session.getSessionId()) - .documentId(session.getDocumentId()) - .alphaWarning(settings.alphaWarning()) - .ocrRequested(session.isOcrRequested()) - .imageContentDetected(session.isImageContentDetected()) - .textCharacters(session.getTextCharacters()) - .estimatedTokens(session.getEstimatedTokens()) - .maxCachedCharacters(cacheService.getMaxDocumentCharacters()) - .createdAt(session.getCreatedAt()) - .warnings(sessionWarnings(settings, session)) - .metadata(session.getMetadata()) - .usageSummary(session.getUsageSummary()) - .build(); + ChatbotSessionResponse response = toResponse(session, settings); return ResponseEntity.ok(response); } + @GetMapping("/document/{documentId}") + public ResponseEntity getSessionByDocument( + @PathVariable String documentId) { + ChatbotSettings settings = featureProperties.current(); + ChatbotSession session = + sessionRegistry + .findByDocumentId(documentId) + .orElseThrow(() -> new ChatbotException("Session not found")); + return ResponseEntity.ok(toResponse(session, settings)); + } + @DeleteMapping("/session/{sessionId}") public ResponseEntity closeSession(@PathVariable String sessionId) { chatbotService.close(sessionId); @@ -123,4 +106,21 @@ public class ChatbotController { return warnings; } + + private ChatbotSessionResponse toResponse(ChatbotSession session, ChatbotSettings settings) { + return ChatbotSessionResponse.builder() + .sessionId(session.getSessionId()) + .documentId(session.getDocumentId()) + .alphaWarning(settings.alphaWarning()) + .ocrRequested(session.isOcrRequested()) + .imageContentDetected(session.isImageContentDetected()) + .textCharacters(session.getTextCharacters()) + .estimatedTokens(session.getEstimatedTokens()) + 
.maxCachedCharacters(cacheService.getMaxDocumentCharacters()) + .createdAt(session.getCreatedAt()) + .warnings(sessionWarnings(settings, session)) + .metadata(session.getMetadata()) + .usageSummary(session.getUsageSummary()) + .build(); + } } diff --git a/app/proprietary/src/main/java/stirling/software/proprietary/service/chatbot/ChatbotSessionRegistry.java b/app/proprietary/src/main/java/stirling/software/proprietary/service/chatbot/ChatbotSessionRegistry.java index 246383db6..633a0c46d 100644 --- a/app/proprietary/src/main/java/stirling/software/proprietary/service/chatbot/ChatbotSessionRegistry.java +++ b/app/proprietary/src/main/java/stirling/software/proprietary/service/chatbot/ChatbotSessionRegistry.java @@ -12,9 +12,13 @@ import stirling.software.proprietary.model.chatbot.ChatbotSession; public class ChatbotSessionRegistry { private final Map sessionStore = new ConcurrentHashMap<>(); + private final Map documentToSession = new ConcurrentHashMap<>(); public void register(ChatbotSession session) { sessionStore.put(session.getSessionId(), session); + if (session.getDocumentId() != null) { + documentToSession.put(session.getDocumentId(), session.getSessionId()); + } } public Optional findById(String sessionId) { @@ -22,6 +26,16 @@ public class ChatbotSessionRegistry { } public void remove(String sessionId) { - sessionStore.remove(sessionId); + Optional.ofNullable(sessionStore.remove(sessionId)) + .map(ChatbotSession::getDocumentId) + .ifPresent(documentId -> documentToSession.remove(documentId, sessionId)); /* conditional remove: drop the document mapping only if it still points at this session, so a newer session registered for the same document stays reachable */ + } + + public Optional findByDocumentId(String documentId) { + return Optional.ofNullable(documentToSession.get(documentId)).flatMap(this::findById); + } + + public void removeByDocumentId(String documentId) { + Optional.ofNullable(documentToSession.remove(documentId)).ifPresent(sessionStore::remove); } } diff --git a/frontend/public/locales/en-GB/translation.json b/frontend/public/locales/en-GB/translation.json index e8d262f08..bf8997a40 100644 ---
a/frontend/public/locales/en-GB/translation.json +++ b/frontend/public/locales/en-GB/translation.json @@ -5145,14 +5145,15 @@ "title": "Stirling PDF Bot", "alphaBadge": "Alpha", "alphaTitle": "Experimental feature", - "alphaDescription": "Chatbot is in currently in alpha and is subject to change. Responses may be imperfect, please check responses.", + "alphaDescription": "This chatbot is in alpha. It currently ignores images and may produce inaccurate answers.", + "acceptAlphaLabel": "I understand this feature is experimental and image content is not supported yet.", "fileLabel": "Document to query", "filePlaceholder": "Select an uploaded PDF", "noFiles": "Upload a PDF from File Manager to start chatting.", "ocrToggle": "Run OCR before extracting text (uses more resources)", "ocrHint": "Enable when your PDF is a scan or contains images.", - "refreshButton": "Re-sync document", - "startButton": "Send document to chat", + "refreshButton": "Reprocess document", + "startButton": "Prepare document for chat", "sessionSummary": "Context summary", "contextDetails": "{{pages}} pages · {{chars}} characters synced", "conversationTitle": "Conversation", @@ -5172,6 +5173,10 @@ "toolHint": "The chat window slides in from the left. If it is already open, this button simply focuses it and passes along the currently selected PDF.", "toolTitleMenu": "Chatbot (Alpha)", "toolMenuDescription": "Chat with Stirling Bot about the contents of your PDF.", + "errors": { + "preprocessing": "Unable to prepare this document.", + "unsupported": "Unsupported document type." 
+ }, "status": { "runningOcr": "Running OCR and extracting text…", "extracting": "Extracting text from PDF…", diff --git a/frontend/src/core/components/chatbot/ChatbotDrawer.tsx b/frontend/src/core/components/chatbot/ChatbotDrawer.tsx index 4d0c99848..de6557212 100644 --- a/frontend/src/core/components/chatbot/ChatbotDrawer.tsx +++ b/frontend/src/core/components/chatbot/ChatbotDrawer.tsx @@ -18,17 +18,14 @@ import { useTranslation } from 'react-i18next'; import SmartToyRoundedIcon from '@mui/icons-material/SmartToyRounded'; import WarningAmberRoundedIcon from '@mui/icons-material/WarningAmberRounded'; import SendRoundedIcon from '@mui/icons-material/SendRounded'; -import CloseRoundedIcon from '@mui/icons-material/CloseRounded'; +import RefreshRoundedIcon from '@mui/icons-material/RefreshRounded'; import { useChatbot } from '@app/contexts/ChatbotContext'; import { useFileState } from '@app/contexts/FileContext'; -import { extractTextFromPdf } from '@app/services/pdfTextExtractor'; -import { runOcrForChat } from '@app/services/chatbotOcrService'; import { ChatbotMessageResponse, ChatbotSessionInfo, ChatbotUsageSummary, - createChatbotSession, sendChatbotPrompt, } from '@app/services/chatbotService'; import { useToast } from '@app/components/toast'; @@ -52,28 +49,32 @@ function createMessageId() { } const MAX_PROMPT_CHARS = 4000; +const ALPHA_ACK_KEY = 'stirling.chatbot.alphaAck'; const ChatbotDrawer = () => { const { t } = useTranslation(); const isMobile = useMediaQuery('(max-width: 768px)'); const { width: viewportWidth, height: viewportHeight } = useViewportSize(); - const { isOpen, closeChat, preferredFileId, setPreferredFileId } = useChatbot(); + const { + isOpen, + closeChat, + preferredFileId, + setPreferredFileId, + sessions: preparedSessions, + requestPreprocessing, + } = useChatbot(); const { selectors } = useFileState(); const { sidebarRefs } = useSidebarContext(); const { show } = useToast(); const files = selectors.getFiles(); const [selectedFileId, 
setSelectedFileId] = useState(); + const [alphaAccepted, setAlphaAccepted] = useState(false); const [runOcr, setRunOcr] = useState(false); const [isStartingSession, setIsStartingSession] = useState(false); const [isSendingMessage, setIsSendingMessage] = useState(false); - const [statusMessage, setStatusMessage] = useState(''); - const [sessionInfo, setSessionInfo] = useState(null); - const [contextStats, setContextStats] = useState<{ pageCount: number; characterCount: number } | null>(null); const [messages, setMessages] = useState([]); const [prompt, setPrompt] = useState(''); const [warnings, setWarnings] = useState([]); - const [noTextModalOpen, setNoTextModalOpen] = useState(false); - const [pendingOcrRetry, setPendingOcrRetry] = useState(false); const scrollViewportRef = useRef(null); const [panelAnchor, setPanelAnchor] = useState<{ right: number; top: number } | null>(null); const usageAlertState = useRef<'none' | 'warned' | 'limit'>('none'); @@ -82,6 +83,50 @@ const ChatbotDrawer = () => { () => files.find((file) => file.fileId === selectedFileId), [files, selectedFileId] ); + const selectedSessionEntry = selectedFileId + ? preparedSessions[selectedFileId] + : undefined; + const sessionStatus = selectedSessionEntry?.status ?? 'idle'; + const sessionError = selectedSessionEntry?.error; + const sessionInfo: ChatbotSessionInfo | null = selectedSessionEntry?.session ?? null; + const contextStats = + selectedSessionEntry?.status === 'ready' && selectedSessionEntry?.characterCount !== undefined + ? { + pageCount: selectedSessionEntry.pageCount ?? 0, + characterCount: selectedSessionEntry.characterCount ?? 0, + } + : null; + const preparationWarnings = selectedSessionEntry?.warnings ?? 
[]; + const derivedStatusMessage = useMemo(() => { + if (!alphaAccepted) { + return t('chatbot.autoSyncPrompt', 'Acknowledge the alpha notice to start syncing automatically.'); + } + if (sessionStatus === 'processing' || isStartingSession) { + return t('chatbot.status.syncing', 'Preparing document for chat…'); + } + if (sessionStatus === 'error') { + return sessionError || t('chatbot.errors.preprocessing', 'Unable to prepare this document.'); + } + if (sessionStatus === 'unsupported') { + return sessionError || t('chatbot.errors.unsupported', 'Unsupported document type.'); + } + return null; + }, [alphaAccepted, sessionStatus, sessionError, isStartingSession, t]); + const assistantWarnings = useMemo( + () => [...preparationWarnings, ...warnings.filter(Boolean)], + [preparationWarnings, warnings] + ); + + useEffect(() => { + if (!isOpen) { + return; + } + const storedAck = + typeof window !== 'undefined' + ? window.localStorage.getItem(ALPHA_ACK_KEY) === 'true' + : false; + setAlphaAccepted(storedAck); + }, [isOpen]); useEffect(() => { if (!isOpen) { @@ -113,8 +158,16 @@ const ChatbotDrawer = () => { useEffect(() => { usageAlertState.current = 'none'; + if (sessionInfo) { + maybeShowUsageWarning(sessionInfo.usageSummary); + } }, [sessionInfo?.sessionId]); + useEffect(() => { + setMessages([]); + setWarnings([]); + }, [selectedFileId]); + const maybeShowUsageWarning = (usage?: ChatbotUsageSummary | null) => { if (!usage) { return; @@ -144,17 +197,6 @@ const ChatbotDrawer = () => { } }; - useEffect(() => { - if (sessionInfo && sessionInfo.documentId !== selectedFileId) { - setSessionInfo(null); - setContextStats(null); - setMessages([]); - setWarnings([]); - setPendingOcrRetry(false); - setNoTextModalOpen(false); - } - }, [sessionInfo, selectedFileId]); - useLayoutEffect(() => { if (isMobile || !isOpen) { setPanelAnchor(null); @@ -183,15 +225,6 @@ const ChatbotDrawer = () => { }; }, [isMobile, isOpen, sidebarRefs.toolPanelRef]); - const withStatus = async (label: 
string, fn: () => Promise): Promise => { - setStatusMessage(label); - try { - return await fn(); - } finally { - setStatusMessage(''); - } - }; - const ensureFileSelected = () => { if (!selectedFile) { show({ @@ -204,65 +237,27 @@ const ChatbotDrawer = () => { return true; }; - const handleSessionStart = async (forceOcr?: boolean) => { - if (!ensureFileSelected() || !selectedFile) { + const handleAlphaAccept = (checked: boolean) => { + setAlphaAccepted(checked); + if (typeof window !== 'undefined') { + if (checked) { + window.localStorage.setItem(ALPHA_ACK_KEY, 'true'); + } else { + window.localStorage.removeItem(ALPHA_ACK_KEY); + } + } + }; + + const handleManualPrepare = async (forceOcr?: boolean) => { + if (!ensureFileSelected() || !selectedFileId) { return; } setIsStartingSession(true); try { - let workingFile: File = selectedFile; - const shouldRunOcr = forceOcr ?? runOcr; - - const extractionResult = await withStatus( - shouldRunOcr - ? t('chatbot.status.runningOcr', 'Running OCR and extracting text…') - : t('chatbot.status.extracting', 'Extracting text from PDF…'), - async () => { - if (shouldRunOcr) { - workingFile = await runOcrForChat(selectedFile); - } - return extractTextFromPdf(workingFile); - } - ); - - if (!extractionResult.text || extractionResult.text.trim().length === 0) { - setPendingOcrRetry(true); - setNoTextModalOpen(true); - return; - } - - const metadata = { - name: workingFile.name, - size: String(workingFile.size), - pageCount: String(extractionResult.pageCount), - }; - - const sessionPayload = { - sessionId: sessionInfo?.sessionId, - documentId: selectedFile.fileId, - text: extractionResult.text, - metadata, - ocrRequested: shouldRunOcr, - warningsAccepted: true, - }; - - const response = await withStatus( - t('chatbot.status.syncing', 'Syncing document with Stirling Bot…'), - () => createChatbotSession(sessionPayload) - ); - - setSessionInfo(response); - maybeShowUsageWarning(response.usageSummary); - setContextStats({ - pageCount: 
extractionResult.pageCount, - characterCount: extractionResult.characterCount, - }); - setMessages([]); - setWarnings(response.warnings ?? []); - setPendingOcrRetry(false); - setNoTextModalOpen(false); + await requestPreprocessing(selectedFileId, { force: true, forceOcr: forceOcr ?? runOcr }); + usageAlertState.current = 'none'; } catch (error) { - console.error('[Chatbot] Failed to start session', error); + console.error('[Chatbot] Failed to prepare document', error); show({ alertType: 'error', title: t('chatbot.toasts.failedSessionTitle', 'Could not prepare document'), @@ -270,42 +265,11 @@ const ChatbotDrawer = () => { }); } finally { setIsStartingSession(false); - setStatusMessage(''); } }; - useEffect(() => { - if ( - !isOpen || - !selectedFile || - sessionInfo || - isStartingSession || - pendingOcrRetry || - noTextModalOpen - ) { - return; - } - let cancelled = false; - handleSessionStart().catch((error) => { - if (!cancelled) { - console.error('[Chatbot] Auto-sync failed', error); - } - }); - return () => { - cancelled = true; - }; - }, [isOpen, selectedFile, sessionInfo, isStartingSession, pendingOcrRetry, noTextModalOpen, runOcr]); - - useEffect(() => { - if (!sessionInfo) { - return; - } - setSessionInfo(null); - setContextStats(null); - }, [runOcr]); - const handleSendMessage = async () => { - if (!sessionInfo) { + if (!sessionInfo || sessionStatus !== 'ready') { show({ alertType: 'neutral', title: t('chatbot.toasts.noSessionTitle', 'Sync your document first'), @@ -369,7 +333,8 @@ const ChatbotDrawer = () => { [files] ); - const disablePromptInput = !sessionInfo || isStartingSession || isSendingMessage; + const disablePromptInput = + !sessionInfo || sessionStatus !== 'ready' || isStartingSession || isSendingMessage; const canSend = !disablePromptInput && prompt.trim().length > 0; const handlePromptKeyDown = (event: KeyboardEvent) => { @@ -395,7 +360,6 @@ const ChatbotDrawer = () => { ); - const assistantWarnings = warnings.filter(Boolean); const 
safeViewportWidth = viewportWidth || (typeof window !== 'undefined' ? window.innerWidth : 1280); @@ -497,6 +461,28 @@ const ChatbotDrawer = () => { transitionProps={{ transition: 'slide-left', duration: 200 }} > + + + + {t('chatbot.alphaTitle', 'Experimental feature')} + + {t( + 'chatbot.alphaDescription', + 'This chatbot is in alpha. It currently ignores images and may produce inaccurate answers.' + )} + + +