mirror of
https://github.com/Frooodle/Stirling-PDF.git
synced 2025-11-16 01:21:16 +01:00
pre-process docs on upload
This commit is contained in:
parent
9ffbede49a
commit
e5212f7f63
@ -45,21 +45,7 @@ public class ChatbotController {
|
||||
@RequestBody ChatbotSessionCreateRequest request) {
|
||||
ChatbotSession session = chatbotService.createSession(request);
|
||||
ChatbotSettings settings = featureProperties.current();
|
||||
ChatbotSessionResponse response =
|
||||
ChatbotSessionResponse.builder()
|
||||
.sessionId(session.getSessionId())
|
||||
.documentId(session.getDocumentId())
|
||||
.alphaWarning(settings.alphaWarning())
|
||||
.ocrRequested(session.isOcrRequested())
|
||||
.imageContentDetected(session.isImageContentDetected())
|
||||
.textCharacters(session.getTextCharacters())
|
||||
.estimatedTokens(session.getEstimatedTokens())
|
||||
.maxCachedCharacters(cacheService.getMaxDocumentCharacters())
|
||||
.createdAt(session.getCreatedAt())
|
||||
.warnings(sessionWarnings(settings, session))
|
||||
.metadata(session.getMetadata())
|
||||
.usageSummary(session.getUsageSummary())
|
||||
.build();
|
||||
ChatbotSessionResponse response = toResponse(session, settings);
|
||||
return ResponseEntity.status(HttpStatus.CREATED).body(response);
|
||||
}
|
||||
|
||||
@ -76,24 +62,21 @@ public class ChatbotController {
|
||||
sessionRegistry
|
||||
.findById(sessionId)
|
||||
.orElseThrow(() -> new ChatbotException("Session not found"));
|
||||
ChatbotSessionResponse response =
|
||||
ChatbotSessionResponse.builder()
|
||||
.sessionId(session.getSessionId())
|
||||
.documentId(session.getDocumentId())
|
||||
.alphaWarning(settings.alphaWarning())
|
||||
.ocrRequested(session.isOcrRequested())
|
||||
.imageContentDetected(session.isImageContentDetected())
|
||||
.textCharacters(session.getTextCharacters())
|
||||
.estimatedTokens(session.getEstimatedTokens())
|
||||
.maxCachedCharacters(cacheService.getMaxDocumentCharacters())
|
||||
.createdAt(session.getCreatedAt())
|
||||
.warnings(sessionWarnings(settings, session))
|
||||
.metadata(session.getMetadata())
|
||||
.usageSummary(session.getUsageSummary())
|
||||
.build();
|
||||
ChatbotSessionResponse response = toResponse(session, settings);
|
||||
return ResponseEntity.ok(response);
|
||||
}
|
||||
|
||||
@GetMapping("/document/{documentId}")
|
||||
public ResponseEntity<ChatbotSessionResponse> getSessionByDocument(
|
||||
@PathVariable String documentId) {
|
||||
ChatbotSettings settings = featureProperties.current();
|
||||
ChatbotSession session =
|
||||
sessionRegistry
|
||||
.findByDocumentId(documentId)
|
||||
.orElseThrow(() -> new ChatbotException("Session not found"));
|
||||
return ResponseEntity.ok(toResponse(session, settings));
|
||||
}
|
||||
|
||||
@DeleteMapping("/session/{sessionId}")
|
||||
public ResponseEntity<Void> closeSession(@PathVariable String sessionId) {
|
||||
chatbotService.close(sessionId);
|
||||
@ -123,4 +106,21 @@ public class ChatbotController {
|
||||
|
||||
return warnings;
|
||||
}
|
||||
|
||||
private ChatbotSessionResponse toResponse(ChatbotSession session, ChatbotSettings settings) {
|
||||
return ChatbotSessionResponse.builder()
|
||||
.sessionId(session.getSessionId())
|
||||
.documentId(session.getDocumentId())
|
||||
.alphaWarning(settings.alphaWarning())
|
||||
.ocrRequested(session.isOcrRequested())
|
||||
.imageContentDetected(session.isImageContentDetected())
|
||||
.textCharacters(session.getTextCharacters())
|
||||
.estimatedTokens(session.getEstimatedTokens())
|
||||
.maxCachedCharacters(cacheService.getMaxDocumentCharacters())
|
||||
.createdAt(session.getCreatedAt())
|
||||
.warnings(sessionWarnings(settings, session))
|
||||
.metadata(session.getMetadata())
|
||||
.usageSummary(session.getUsageSummary())
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
@ -12,9 +12,13 @@ import stirling.software.proprietary.model.chatbot.ChatbotSession;
|
||||
public class ChatbotSessionRegistry {
|
||||
|
||||
private final Map<String, ChatbotSession> sessionStore = new ConcurrentHashMap<>();
|
||||
private final Map<String, String> documentToSession = new ConcurrentHashMap<>();
|
||||
|
||||
public void register(ChatbotSession session) {
|
||||
sessionStore.put(session.getSessionId(), session);
|
||||
if (session.getDocumentId() != null) {
|
||||
documentToSession.put(session.getDocumentId(), session.getSessionId());
|
||||
}
|
||||
}
|
||||
|
||||
public Optional<ChatbotSession> findById(String sessionId) {
|
||||
@ -22,6 +26,16 @@ public class ChatbotSessionRegistry {
|
||||
}
|
||||
|
||||
public void remove(String sessionId) {
|
||||
sessionStore.remove(sessionId);
|
||||
Optional.ofNullable(sessionStore.remove(sessionId))
|
||||
.map(ChatbotSession::getDocumentId)
|
||||
.ifPresent(documentToSession::remove);
|
||||
}
|
||||
|
||||
public Optional<ChatbotSession> findByDocumentId(String documentId) {
|
||||
return Optional.ofNullable(documentToSession.get(documentId)).flatMap(this::findById);
|
||||
}
|
||||
|
||||
public void removeByDocumentId(String documentId) {
|
||||
Optional.ofNullable(documentToSession.remove(documentId)).ifPresent(sessionStore::remove);
|
||||
}
|
||||
}
|
||||
|
||||
@ -5145,14 +5145,15 @@
|
||||
"title": "Stirling PDF Bot",
|
||||
"alphaBadge": "Alpha",
|
||||
"alphaTitle": "Experimental feature",
|
||||
"alphaDescription": "Chatbot is in currently in alpha and is subject to change. Responses may be imperfect, please check responses.",
|
||||
"alphaDescription": "This chatbot is in alpha. It currently ignores images and may produce inaccurate answers.",
|
||||
"acceptAlphaLabel": "I understand this feature is experimental and image content is not supported yet.",
|
||||
"fileLabel": "Document to query",
|
||||
"filePlaceholder": "Select an uploaded PDF",
|
||||
"noFiles": "Upload a PDF from File Manager to start chatting.",
|
||||
"ocrToggle": "Run OCR before extracting text (uses more resources)",
|
||||
"ocrHint": "Enable when your PDF is a scan or contains images.",
|
||||
"refreshButton": "Re-sync document",
|
||||
"startButton": "Send document to chat",
|
||||
"refreshButton": "Reprocess document",
|
||||
"startButton": "Prepare document for chat",
|
||||
"sessionSummary": "Context summary",
|
||||
"contextDetails": "{{pages}} pages · {{chars}} characters synced",
|
||||
"conversationTitle": "Conversation",
|
||||
@ -5172,6 +5173,10 @@
|
||||
"toolHint": "The chat window slides in from the left. If it is already open, this button simply focuses it and passes along the currently selected PDF.",
|
||||
"toolTitleMenu": "Chatbot (Alpha)",
|
||||
"toolMenuDescription": "Chat with Stirling Bot about the contents of your PDF.",
|
||||
"errors": {
|
||||
"preprocessing": "Unable to prepare this document.",
|
||||
"unsupported": "Unsupported document type."
|
||||
},
|
||||
"status": {
|
||||
"runningOcr": "Running OCR and extracting text…",
|
||||
"extracting": "Extracting text from PDF…",
|
||||
|
||||
@ -18,17 +18,14 @@ import { useTranslation } from 'react-i18next';
|
||||
import SmartToyRoundedIcon from '@mui/icons-material/SmartToyRounded';
|
||||
import WarningAmberRoundedIcon from '@mui/icons-material/WarningAmberRounded';
|
||||
import SendRoundedIcon from '@mui/icons-material/SendRounded';
|
||||
import CloseRoundedIcon from '@mui/icons-material/CloseRounded';
|
||||
import RefreshRoundedIcon from '@mui/icons-material/RefreshRounded';
|
||||
|
||||
import { useChatbot } from '@app/contexts/ChatbotContext';
|
||||
import { useFileState } from '@app/contexts/FileContext';
|
||||
import { extractTextFromPdf } from '@app/services/pdfTextExtractor';
|
||||
import { runOcrForChat } from '@app/services/chatbotOcrService';
|
||||
import {
|
||||
ChatbotMessageResponse,
|
||||
ChatbotSessionInfo,
|
||||
ChatbotUsageSummary,
|
||||
createChatbotSession,
|
||||
sendChatbotPrompt,
|
||||
} from '@app/services/chatbotService';
|
||||
import { useToast } from '@app/components/toast';
|
||||
@ -52,28 +49,32 @@ function createMessageId() {
|
||||
}
|
||||
|
||||
const MAX_PROMPT_CHARS = 4000;
|
||||
const ALPHA_ACK_KEY = 'stirling.chatbot.alphaAck';
|
||||
|
||||
const ChatbotDrawer = () => {
|
||||
const { t } = useTranslation();
|
||||
const isMobile = useMediaQuery('(max-width: 768px)');
|
||||
const { width: viewportWidth, height: viewportHeight } = useViewportSize();
|
||||
const { isOpen, closeChat, preferredFileId, setPreferredFileId } = useChatbot();
|
||||
const {
|
||||
isOpen,
|
||||
closeChat,
|
||||
preferredFileId,
|
||||
setPreferredFileId,
|
||||
sessions: preparedSessions,
|
||||
requestPreprocessing,
|
||||
} = useChatbot();
|
||||
const { selectors } = useFileState();
|
||||
const { sidebarRefs } = useSidebarContext();
|
||||
const { show } = useToast();
|
||||
const files = selectors.getFiles();
|
||||
const [selectedFileId, setSelectedFileId] = useState<string | undefined>();
|
||||
const [alphaAccepted, setAlphaAccepted] = useState(false);
|
||||
const [runOcr, setRunOcr] = useState(false);
|
||||
const [isStartingSession, setIsStartingSession] = useState(false);
|
||||
const [isSendingMessage, setIsSendingMessage] = useState(false);
|
||||
const [statusMessage, setStatusMessage] = useState<string>('');
|
||||
const [sessionInfo, setSessionInfo] = useState<ChatbotSessionInfo | null>(null);
|
||||
const [contextStats, setContextStats] = useState<{ pageCount: number; characterCount: number } | null>(null);
|
||||
const [messages, setMessages] = useState<ChatMessage[]>([]);
|
||||
const [prompt, setPrompt] = useState('');
|
||||
const [warnings, setWarnings] = useState<string[]>([]);
|
||||
const [noTextModalOpen, setNoTextModalOpen] = useState(false);
|
||||
const [pendingOcrRetry, setPendingOcrRetry] = useState(false);
|
||||
const scrollViewportRef = useRef<HTMLDivElement>(null);
|
||||
const [panelAnchor, setPanelAnchor] = useState<{ right: number; top: number } | null>(null);
|
||||
const usageAlertState = useRef<'none' | 'warned' | 'limit'>('none');
|
||||
@ -82,6 +83,50 @@ const ChatbotDrawer = () => {
|
||||
() => files.find((file) => file.fileId === selectedFileId),
|
||||
[files, selectedFileId]
|
||||
);
|
||||
const selectedSessionEntry = selectedFileId
|
||||
? preparedSessions[selectedFileId]
|
||||
: undefined;
|
||||
const sessionStatus = selectedSessionEntry?.status ?? 'idle';
|
||||
const sessionError = selectedSessionEntry?.error;
|
||||
const sessionInfo: ChatbotSessionInfo | null = selectedSessionEntry?.session ?? null;
|
||||
const contextStats =
|
||||
selectedSessionEntry?.status === 'ready' && selectedSessionEntry?.characterCount !== undefined
|
||||
? {
|
||||
pageCount: selectedSessionEntry.pageCount ?? 0,
|
||||
characterCount: selectedSessionEntry.characterCount ?? 0,
|
||||
}
|
||||
: null;
|
||||
const preparationWarnings = selectedSessionEntry?.warnings ?? [];
|
||||
const derivedStatusMessage = useMemo(() => {
|
||||
if (!alphaAccepted) {
|
||||
return t('chatbot.autoSyncPrompt', 'Acknowledge the alpha notice to start syncing automatically.');
|
||||
}
|
||||
if (sessionStatus === 'processing' || isStartingSession) {
|
||||
return t('chatbot.status.syncing', 'Preparing document for chat…');
|
||||
}
|
||||
if (sessionStatus === 'error') {
|
||||
return sessionError || t('chatbot.errors.preprocessing', 'Unable to prepare this document.');
|
||||
}
|
||||
if (sessionStatus === 'unsupported') {
|
||||
return sessionError || t('chatbot.errors.unsupported', 'Unsupported document type.');
|
||||
}
|
||||
return null;
|
||||
}, [alphaAccepted, sessionStatus, sessionError, isStartingSession, t]);
|
||||
const assistantWarnings = useMemo(
|
||||
() => [...preparationWarnings, ...warnings.filter(Boolean)],
|
||||
[preparationWarnings, warnings]
|
||||
);
|
||||
|
||||
useEffect(() => {
|
||||
if (!isOpen) {
|
||||
return;
|
||||
}
|
||||
const storedAck =
|
||||
typeof window !== 'undefined'
|
||||
? window.localStorage.getItem(ALPHA_ACK_KEY) === 'true'
|
||||
: false;
|
||||
setAlphaAccepted(storedAck);
|
||||
}, [isOpen]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!isOpen) {
|
||||
@ -113,8 +158,16 @@ const ChatbotDrawer = () => {
|
||||
|
||||
useEffect(() => {
|
||||
usageAlertState.current = 'none';
|
||||
if (sessionInfo) {
|
||||
maybeShowUsageWarning(sessionInfo.usageSummary);
|
||||
}
|
||||
}, [sessionInfo?.sessionId]);
|
||||
|
||||
useEffect(() => {
|
||||
setMessages([]);
|
||||
setWarnings([]);
|
||||
}, [selectedFileId]);
|
||||
|
||||
const maybeShowUsageWarning = (usage?: ChatbotUsageSummary | null) => {
|
||||
if (!usage) {
|
||||
return;
|
||||
@ -144,17 +197,6 @@ const ChatbotDrawer = () => {
|
||||
}
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
if (sessionInfo && sessionInfo.documentId !== selectedFileId) {
|
||||
setSessionInfo(null);
|
||||
setContextStats(null);
|
||||
setMessages([]);
|
||||
setWarnings([]);
|
||||
setPendingOcrRetry(false);
|
||||
setNoTextModalOpen(false);
|
||||
}
|
||||
}, [sessionInfo, selectedFileId]);
|
||||
|
||||
useLayoutEffect(() => {
|
||||
if (isMobile || !isOpen) {
|
||||
setPanelAnchor(null);
|
||||
@ -183,15 +225,6 @@ const ChatbotDrawer = () => {
|
||||
};
|
||||
}, [isMobile, isOpen, sidebarRefs.toolPanelRef]);
|
||||
|
||||
const withStatus = async <T,>(label: string, fn: () => Promise<T>): Promise<T> => {
|
||||
setStatusMessage(label);
|
||||
try {
|
||||
return await fn();
|
||||
} finally {
|
||||
setStatusMessage('');
|
||||
}
|
||||
};
|
||||
|
||||
const ensureFileSelected = () => {
|
||||
if (!selectedFile) {
|
||||
show({
|
||||
@ -204,65 +237,27 @@ const ChatbotDrawer = () => {
|
||||
return true;
|
||||
};
|
||||
|
||||
const handleSessionStart = async (forceOcr?: boolean) => {
|
||||
if (!ensureFileSelected() || !selectedFile) {
|
||||
const handleAlphaAccept = (checked: boolean) => {
|
||||
setAlphaAccepted(checked);
|
||||
if (typeof window !== 'undefined') {
|
||||
if (checked) {
|
||||
window.localStorage.setItem(ALPHA_ACK_KEY, 'true');
|
||||
} else {
|
||||
window.localStorage.removeItem(ALPHA_ACK_KEY);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const handleManualPrepare = async (forceOcr?: boolean) => {
|
||||
if (!ensureFileSelected() || !selectedFileId) {
|
||||
return;
|
||||
}
|
||||
setIsStartingSession(true);
|
||||
try {
|
||||
let workingFile: File = selectedFile;
|
||||
const shouldRunOcr = forceOcr ?? runOcr;
|
||||
|
||||
const extractionResult = await withStatus(
|
||||
shouldRunOcr
|
||||
? t('chatbot.status.runningOcr', 'Running OCR and extracting text…')
|
||||
: t('chatbot.status.extracting', 'Extracting text from PDF…'),
|
||||
async () => {
|
||||
if (shouldRunOcr) {
|
||||
workingFile = await runOcrForChat(selectedFile);
|
||||
}
|
||||
return extractTextFromPdf(workingFile);
|
||||
}
|
||||
);
|
||||
|
||||
if (!extractionResult.text || extractionResult.text.trim().length === 0) {
|
||||
setPendingOcrRetry(true);
|
||||
setNoTextModalOpen(true);
|
||||
return;
|
||||
}
|
||||
|
||||
const metadata = {
|
||||
name: workingFile.name,
|
||||
size: String(workingFile.size),
|
||||
pageCount: String(extractionResult.pageCount),
|
||||
};
|
||||
|
||||
const sessionPayload = {
|
||||
sessionId: sessionInfo?.sessionId,
|
||||
documentId: selectedFile.fileId,
|
||||
text: extractionResult.text,
|
||||
metadata,
|
||||
ocrRequested: shouldRunOcr,
|
||||
warningsAccepted: true,
|
||||
};
|
||||
|
||||
const response = await withStatus(
|
||||
t('chatbot.status.syncing', 'Syncing document with Stirling Bot…'),
|
||||
() => createChatbotSession(sessionPayload)
|
||||
);
|
||||
|
||||
setSessionInfo(response);
|
||||
maybeShowUsageWarning(response.usageSummary);
|
||||
setContextStats({
|
||||
pageCount: extractionResult.pageCount,
|
||||
characterCount: extractionResult.characterCount,
|
||||
});
|
||||
setMessages([]);
|
||||
setWarnings(response.warnings ?? []);
|
||||
setPendingOcrRetry(false);
|
||||
setNoTextModalOpen(false);
|
||||
await requestPreprocessing(selectedFileId, { force: true, forceOcr: forceOcr ?? runOcr });
|
||||
usageAlertState.current = 'none';
|
||||
} catch (error) {
|
||||
console.error('[Chatbot] Failed to start session', error);
|
||||
console.error('[Chatbot] Failed to prepare document', error);
|
||||
show({
|
||||
alertType: 'error',
|
||||
title: t('chatbot.toasts.failedSessionTitle', 'Could not prepare document'),
|
||||
@ -270,42 +265,11 @@ const ChatbotDrawer = () => {
|
||||
});
|
||||
} finally {
|
||||
setIsStartingSession(false);
|
||||
setStatusMessage('');
|
||||
}
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
if (
|
||||
!isOpen ||
|
||||
!selectedFile ||
|
||||
sessionInfo ||
|
||||
isStartingSession ||
|
||||
pendingOcrRetry ||
|
||||
noTextModalOpen
|
||||
) {
|
||||
return;
|
||||
}
|
||||
let cancelled = false;
|
||||
handleSessionStart().catch((error) => {
|
||||
if (!cancelled) {
|
||||
console.error('[Chatbot] Auto-sync failed', error);
|
||||
}
|
||||
});
|
||||
return () => {
|
||||
cancelled = true;
|
||||
};
|
||||
}, [isOpen, selectedFile, sessionInfo, isStartingSession, pendingOcrRetry, noTextModalOpen, runOcr]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!sessionInfo) {
|
||||
return;
|
||||
}
|
||||
setSessionInfo(null);
|
||||
setContextStats(null);
|
||||
}, [runOcr]);
|
||||
|
||||
const handleSendMessage = async () => {
|
||||
if (!sessionInfo) {
|
||||
if (!sessionInfo || sessionStatus !== 'ready') {
|
||||
show({
|
||||
alertType: 'neutral',
|
||||
title: t('chatbot.toasts.noSessionTitle', 'Sync your document first'),
|
||||
@ -369,7 +333,8 @@ const ChatbotDrawer = () => {
|
||||
[files]
|
||||
);
|
||||
|
||||
const disablePromptInput = !sessionInfo || isStartingSession || isSendingMessage;
|
||||
const disablePromptInput =
|
||||
!sessionInfo || sessionStatus !== 'ready' || isStartingSession || isSendingMessage;
|
||||
const canSend = !disablePromptInput && prompt.trim().length > 0;
|
||||
|
||||
const handlePromptKeyDown = (event: KeyboardEvent<HTMLTextAreaElement>) => {
|
||||
@ -395,7 +360,6 @@ const ChatbotDrawer = () => {
|
||||
</Group>
|
||||
);
|
||||
|
||||
const assistantWarnings = warnings.filter(Boolean);
|
||||
|
||||
const safeViewportWidth =
|
||||
viewportWidth || (typeof window !== 'undefined' ? window.innerWidth : 1280);
|
||||
@ -497,6 +461,28 @@ const ChatbotDrawer = () => {
|
||||
transitionProps={{ transition: 'slide-left', duration: 200 }}
|
||||
>
|
||||
<Stack gap="sm" h="100%" style={{ minHeight: 0 }}>
|
||||
<Box
|
||||
p="sm"
|
||||
style={{
|
||||
border: '1px solid var(--border-subtle)',
|
||||
borderRadius: 8,
|
||||
backgroundColor: 'var(--bg-subtle)',
|
||||
display: 'flex',
|
||||
gap: '0.5rem',
|
||||
alignItems: 'flex-start',
|
||||
}}
|
||||
>
|
||||
<WarningAmberRoundedIcon fontSize="small" style={{ color: 'var(--text-warning)' }} />
|
||||
<Box>
|
||||
<Text fw={600}>{t('chatbot.alphaTitle', 'Experimental feature')}</Text>
|
||||
<Text size="sm">
|
||||
{t(
|
||||
'chatbot.alphaDescription',
|
||||
'This chatbot is in alpha. It currently ignores images and may produce inaccurate answers.'
|
||||
)}
|
||||
</Text>
|
||||
</Box>
|
||||
</Box>
|
||||
|
||||
<Group align="flex-end" justify="space-between" gap="md" wrap="wrap">
|
||||
<Select
|
||||
@ -508,7 +494,12 @@ const ChatbotDrawer = () => {
|
||||
nothingFoundMessage={t('chatbot.noFiles', 'Upload a PDF from File Manager to start chatting.')}
|
||||
style={{ flex: '1 1 200px' }}
|
||||
/>
|
||||
<Stack gap={4} style={{ minWidth: 160 }}>
|
||||
<Stack gap={4} style={{ minWidth: 180 }}>
|
||||
<Switch
|
||||
checked={alphaAccepted}
|
||||
onChange={(event) => handleAlphaAccept(event.currentTarget.checked)}
|
||||
label={t('chatbot.acceptAlphaLabel', 'I acknowledge this experimental feature')}
|
||||
/>
|
||||
<Switch
|
||||
checked={runOcr}
|
||||
onChange={(event) => setRunOcr(event.currentTarget.checked)}
|
||||
@ -517,7 +508,20 @@ const ChatbotDrawer = () => {
|
||||
</Stack>
|
||||
</Group>
|
||||
|
||||
{statusMessage && (
|
||||
<Button
|
||||
fullWidth
|
||||
variant="filled"
|
||||
leftSection={<RefreshRoundedIcon fontSize="small" />}
|
||||
loading={isStartingSession || sessionStatus === 'processing'}
|
||||
onClick={() => handleManualPrepare()}
|
||||
disabled={!selectedFile || !alphaAccepted || sessionStatus === 'processing'}
|
||||
>
|
||||
{sessionStatus === 'ready'
|
||||
? t('chatbot.refreshButton', 'Reprocess document')
|
||||
: t('chatbot.startButton', 'Prepare document for chat')}
|
||||
</Button>
|
||||
|
||||
{derivedStatusMessage && (
|
||||
<Box
|
||||
p="sm"
|
||||
style={{
|
||||
@ -526,7 +530,16 @@ const ChatbotDrawer = () => {
|
||||
backgroundColor: 'var(--bg-muted)',
|
||||
}}
|
||||
>
|
||||
<Text size="sm" c="blue">{statusMessage}</Text>
|
||||
<Text
|
||||
size="sm"
|
||||
c={
|
||||
sessionStatus === 'error' || sessionStatus === 'unsupported'
|
||||
? 'var(--text-warning)'
|
||||
: 'blue'
|
||||
}
|
||||
>
|
||||
{derivedStatusMessage}
|
||||
</Text>
|
||||
</Box>
|
||||
)}
|
||||
|
||||
@ -561,28 +574,6 @@ const ChatbotDrawer = () => {
|
||||
</Group>
|
||||
</Box>
|
||||
))}
|
||||
{isOpen && (
|
||||
<Box
|
||||
p="sm"
|
||||
bg="var(--bg-muted)"
|
||||
style={{ borderRadius: 12, border: '1px solid var(--border-subtle)' }}
|
||||
>
|
||||
<Group gap="xs" align="flex-start">
|
||||
<WarningAmberRoundedIcon fontSize="small" style={{ color: 'var(--text-warning)' }} />
|
||||
<Box>
|
||||
<Text size="sm" fw={600}>
|
||||
{t('chatbot.alphaTitle', 'Experimental feature')}
|
||||
</Text>
|
||||
<Text size="sm">
|
||||
{t(
|
||||
'chatbot.alphaDescription',
|
||||
'This chatbot is in alpha. It currently ignores images and may produce inaccurate answers.'
|
||||
)}
|
||||
</Text>
|
||||
</Box>
|
||||
</Group>
|
||||
</Box>
|
||||
)}
|
||||
{messages.length === 0 && (
|
||||
<Text size="sm" c="dimmed">
|
||||
{t('chatbot.emptyState', 'Ask a question about your PDF to start the conversation.')}
|
||||
@ -634,35 +625,6 @@ const ChatbotDrawer = () => {
|
||||
</Stack>
|
||||
</Modal>
|
||||
|
||||
<Modal
|
||||
opened={noTextModalOpen}
|
||||
onClose={() => setNoTextModalOpen(false)}
|
||||
title={t('chatbot.noTextTitle', 'No text detected in this PDF')}
|
||||
centered
|
||||
>
|
||||
<Stack gap="sm">
|
||||
<Text size="sm">
|
||||
{t('chatbot.noTextBody', 'We could not find selectable text in this document. Would you like to run OCR to convert scanned pages into text?')}
|
||||
</Text>
|
||||
<Group justify="flex-end">
|
||||
<Button variant="default" leftSection={<CloseRoundedIcon fontSize="small" />} onClick={() => setNoTextModalOpen(false)}>
|
||||
{t('chatbot.noTextDismiss', 'Maybe later')}
|
||||
</Button>
|
||||
<Button
|
||||
leftSection={<SmartToyRoundedIcon fontSize="small" />}
|
||||
onClick={() => {
|
||||
setNoTextModalOpen(false);
|
||||
setRunOcr(true);
|
||||
if (pendingOcrRetry) {
|
||||
handleSessionStart(true);
|
||||
}
|
||||
}}
|
||||
>
|
||||
{t('chatbot.noTextRunOcr', 'Run OCR and retry')}
|
||||
</Button>
|
||||
</Group>
|
||||
</Stack>
|
||||
</Modal>
|
||||
</>
|
||||
);
|
||||
};
|
||||
|
||||
@ -1,4 +1,23 @@
|
||||
import { createContext, useCallback, useContext, useMemo, useState, type ReactNode } from 'react';
|
||||
import {
|
||||
createContext,
|
||||
useCallback,
|
||||
useContext,
|
||||
useEffect,
|
||||
useMemo,
|
||||
useRef,
|
||||
useState,
|
||||
type ReactNode,
|
||||
} from 'react';
|
||||
|
||||
import { useFileState } from '@app/contexts/FileContext';
|
||||
import type { StirlingFile } from '@app/types/fileContext';
|
||||
import { extractTextFromPdf } from '@app/services/pdfTextExtractor';
|
||||
import { extractTextFromDocx } from '@app/services/docxTextExtractor';
|
||||
import {
|
||||
ChatbotSessionInfo,
|
||||
createChatbotSession,
|
||||
} from '@app/services/chatbotService';
|
||||
import { runOcrForChat } from '@app/services/chatbotOcrService';
|
||||
|
||||
type ChatbotSource = 'viewer' | 'tool';
|
||||
|
||||
@ -7,6 +26,25 @@ interface OpenChatOptions {
|
||||
fileId?: string;
|
||||
}
|
||||
|
||||
type PreparationStatus = 'idle' | 'processing' | 'ready' | 'error' | 'unsupported';
|
||||
|
||||
interface PreparedChatbotDocument {
|
||||
documentId: string;
|
||||
fileId: string;
|
||||
fileName: string;
|
||||
status: PreparationStatus;
|
||||
session?: ChatbotSessionInfo;
|
||||
characterCount?: number;
|
||||
pageCount?: number;
|
||||
warnings?: string[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
interface PreprocessOptions {
|
||||
force?: boolean;
|
||||
forceOcr?: boolean;
|
||||
}
|
||||
|
||||
interface ChatbotContextValue {
|
||||
isOpen: boolean;
|
||||
source: ChatbotSource;
|
||||
@ -14,6 +52,8 @@ interface ChatbotContextValue {
|
||||
openChat: (options?: OpenChatOptions) => void;
|
||||
closeChat: () => void;
|
||||
setPreferredFileId: (fileId?: string) => void;
|
||||
sessions: Record<string, PreparedChatbotDocument>;
|
||||
requestPreprocessing: (fileId: string, options?: PreprocessOptions) => Promise<void>;
|
||||
}
|
||||
|
||||
const ChatbotContext = createContext<ChatbotContextValue | undefined>(undefined);
|
||||
@ -23,6 +63,184 @@ export function ChatbotProvider({ children }: { children: ReactNode }) {
|
||||
const [source, setSource] = useState<ChatbotSource>('viewer');
|
||||
const [preferredFileId, setPreferredFileId] = useState<string | undefined>();
|
||||
|
||||
const { selectors } = useFileState();
|
||||
const [preparedSessions, setPreparedSessions] = useState<
|
||||
Record<string, PreparedChatbotDocument>
|
||||
>({});
|
||||
const sessionsRef = useRef(preparedSessions);
|
||||
sessionsRef.current = preparedSessions;
|
||||
const inFlightRef = useRef<Map<string, Promise<void>>>(new Map());
|
||||
|
||||
const supportedExtensions = useMemo(
|
||||
() => new Set(['pdf', 'doc', 'docx']),
|
||||
[]
|
||||
);
|
||||
|
||||
const getExtension = useCallback((file: StirlingFile) => {
|
||||
const parts = file.name.split('.');
|
||||
return parts.length > 1 ? parts.at(-1)!.toLowerCase() : '';
|
||||
}, []);
|
||||
|
||||
const updateSessionEntry = useCallback((file: StirlingFile, partial: Partial<PreparedChatbotDocument>) => {
|
||||
setPreparedSessions((prev) => ({
|
||||
...prev,
|
||||
[file.fileId]: {
|
||||
...prev[file.fileId],
|
||||
documentId: file.fileId,
|
||||
fileId: file.fileId,
|
||||
fileName: file.name,
|
||||
status: 'idle',
|
||||
...partial,
|
||||
},
|
||||
}));
|
||||
}, []);
|
||||
|
||||
const preprocessFile = useCallback(
|
||||
async (file: StirlingFile, options?: PreprocessOptions) => {
|
||||
const extension = getExtension(file);
|
||||
if (!supportedExtensions.has(extension)) {
|
||||
updateSessionEntry(file, {
|
||||
status: 'unsupported',
|
||||
error: 'Only PDF and Word documents are indexed for chat.',
|
||||
});
|
||||
return;
|
||||
}
|
||||
if (extension === 'doc') {
|
||||
updateSessionEntry(file, {
|
||||
status: 'unsupported',
|
||||
error: 'Legacy Word (.doc) files are not supported yet.',
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
updateSessionEntry(file, {
|
||||
status: 'processing',
|
||||
error: undefined,
|
||||
session: undefined,
|
||||
warnings: undefined,
|
||||
characterCount: undefined,
|
||||
pageCount: undefined,
|
||||
});
|
||||
|
||||
try {
|
||||
let workingFile: File = file;
|
||||
const shouldRunOcr = Boolean(options?.forceOcr && extension === 'pdf');
|
||||
if (shouldRunOcr) {
|
||||
workingFile = await runOcrForChat(file);
|
||||
}
|
||||
let extracted: { text: string; pageCount?: number; characterCount: number };
|
||||
if (extension === 'pdf') {
|
||||
const pdfResult = await extractTextFromPdf(workingFile);
|
||||
extracted = {
|
||||
text: pdfResult.text,
|
||||
pageCount: pdfResult.pageCount,
|
||||
characterCount: pdfResult.characterCount,
|
||||
};
|
||||
} else {
|
||||
const docxResult = await extractTextFromDocx(workingFile);
|
||||
extracted = {
|
||||
text: docxResult.text,
|
||||
pageCount: 0,
|
||||
characterCount: docxResult.characterCount,
|
||||
};
|
||||
}
|
||||
|
||||
if (!extracted.text || extracted.text.trim().length === 0) {
|
||||
throw new Error(
|
||||
'No text detected. Try running OCR from the chat window.'
|
||||
);
|
||||
}
|
||||
|
||||
const metadata: Record<string, string> = {
|
||||
fileName: workingFile.name,
|
||||
fileSize: String(workingFile.size),
|
||||
fileType: workingFile.type || extension,
|
||||
characterCount: String(extracted.characterCount),
|
||||
ocrApplied: shouldRunOcr ? 'true' : 'false',
|
||||
};
|
||||
if (typeof extracted.pageCount === 'number') {
|
||||
metadata.pageCount = String(extracted.pageCount);
|
||||
}
|
||||
|
||||
const session = await createChatbotSession({
|
||||
sessionId: file.fileId,
|
||||
documentId: file.fileId,
|
||||
text: extracted.text,
|
||||
metadata,
|
||||
ocrRequested: shouldRunOcr,
|
||||
warningsAccepted: true,
|
||||
});
|
||||
|
||||
updateSessionEntry(file, {
|
||||
status: 'ready',
|
||||
session,
|
||||
characterCount: extracted.characterCount,
|
||||
pageCount: extracted.pageCount,
|
||||
warnings: session.warnings ?? [],
|
||||
error: undefined,
|
||||
});
|
||||
} catch (error) {
|
||||
const message =
|
||||
error instanceof Error
|
||||
? error.message
|
||||
: 'Failed to prepare document for chatbot.';
|
||||
updateSessionEntry(file, {
|
||||
status: 'error',
|
||||
error: message,
|
||||
});
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
[getExtension, supportedExtensions, updateSessionEntry]
|
||||
);
|
||||
|
||||
/**
 * Starts (or joins) chatbot preprocessing for the file identified by `fileId`.
 *
 * - Resolves immediately without doing anything when the file is not known to
 *   the file selectors.
 * - When a run for the same file is already in flight and `options.force` is
 *   not set, the existing promise is returned so callers share one run.
 * - With `options.force`, a new run is started and becomes the tracked
 *   in-flight promise for that file.
 *
 * The returned promise rejects if `preprocessFile` rejects.
 */
const requestPreprocessing = useCallback(
  async (fileId: string, options?: PreprocessOptions) => {
    const file = selectors.getFile(fileId as any);
    if (!file) {
      return;
    }

    const pending = inFlightRef.current.get(fileId);
    if (pending && !options?.force) {
      return pending;
    }

    const promise: Promise<void> = preprocessFile(file, options).finally(() => {
      // A forced request may have replaced this entry while this run was still
      // going; only clear the slot if it still belongs to this run, otherwise
      // the newer run would lose its de-duplication entry.
      if (inFlightRef.current.get(fileId) === promise) {
        inFlightRef.current.delete(fileId);
      }
    });
    inFlightRef.current.set(fileId, promise);
    return promise;
  },
  [selectors, preprocessFile]
);
|
||||
|
||||
const filesSignature = selectors.getFilesSignature();
|
||||
const availableFiles = useMemo(
|
||||
() => selectors.getFiles(),
|
||||
[filesSignature, selectors]
|
||||
);
|
||||
|
||||
useEffect(() => {
|
||||
availableFiles.forEach((file) => {
|
||||
if (!supportedExtensions.has(getExtension(file))) {
|
||||
return;
|
||||
}
|
||||
if (!sessionsRef.current[file.fileId]) {
|
||||
requestPreprocessing(file.fileId).catch(() => {});
|
||||
}
|
||||
});
|
||||
|
||||
const currentIds = new Set(availableFiles.map((file) => file.fileId));
|
||||
setPreparedSessions((prev) => {
|
||||
const next = { ...prev };
|
||||
Object.keys(next).forEach((fileId) => {
|
||||
if (!currentIds.has(fileId as any)) {
|
||||
delete next[fileId];
|
||||
}
|
||||
});
|
||||
return next;
|
||||
});
|
||||
}, [availableFiles, getExtension, requestPreprocessing, supportedExtensions]);
|
||||
|
||||
const openChat = useCallback((options: OpenChatOptions = {}) => {
|
||||
if (options.source) {
|
||||
setSource(options.source);
|
||||
@ -45,8 +263,10 @@ export function ChatbotProvider({ children }: { children: ReactNode }) {
|
||||
openChat,
|
||||
closeChat,
|
||||
setPreferredFileId,
|
||||
sessions: preparedSessions,
|
||||
requestPreprocessing,
|
||||
}),
|
||||
[isOpen, source, preferredFileId, openChat, closeChat]
|
||||
[isOpen, source, preferredFileId, openChat, closeChat, preparedSessions, requestPreprocessing]
|
||||
);
|
||||
|
||||
return <ChatbotContext.Provider value={value}>{children}</ChatbotContext.Provider>;
|
||||
|
||||
34
frontend/src/core/services/docxTextExtractor.ts
Normal file
34
frontend/src/core/services/docxTextExtractor.ts
Normal file
@ -0,0 +1,34 @@
|
||||
import JSZip from 'jszip';
|
||||
|
||||
export interface ExtractedDocxText {
|
||||
text: string;
|
||||
characterCount: number;
|
||||
}
|
||||
|
||||
/**
 * Extracts plain text from a .docx file.
 *
 * The archive is opened with JSZip and the main document part
 * (`word/document.xml`, falling back to `word/document2.xml`) is parsed as XML.
 * Each paragraph element contributes one line: its text content with runs of
 * whitespace collapsed to a single space and trimmed. Empty paragraphs are
 * dropped and the remaining lines are joined with `\n`.
 *
 * @param file The .docx file to read.
 * @returns The extracted text and its length in characters.
 * @throws Error when neither document part exists in the archive.
 */
export async function extractTextFromDocx(file: File): Promise<ExtractedDocxText> {
  const zip = await JSZip.loadAsync(file);
  const documentXml =
    (await zip.file('word/document.xml')?.async('string')) ??
    (await zip.file('word/document2.xml')?.async('string'));

  if (!documentXml) {
    throw new Error('Docx document.xml missing');
  }

  const xml = new DOMParser().parseFromString(documentXml, 'application/xml');

  // The namespace-aware lookup already matches every <w:p> element. Querying
  // 'w:p' as well returns the same nodes again and would duplicate every
  // paragraph, so the prefixed lookup is only used when the first finds nothing.
  let paragraphNodes = Array.from(xml.getElementsByTagNameNS('*', 'p'));
  if (paragraphNodes.length === 0) {
    paragraphNodes = Array.from(xml.getElementsByTagName('w:p'));
  }

  const text = paragraphNodes
    .map((p) => (p.textContent || '').replace(/\s+/g, ' ').trim())
    .filter(Boolean)
    .join('\n')
    .trim();

  return {
    text,
    characterCount: text.length,
  };
}
|
||||
Loading…
Reference in New Issue
Block a user