pre-process docs on upload

This commit is contained in:
DarioGii 2025-11-15 20:07:31 +00:00
parent 9ffbede49a
commit e5212f7f63
6 changed files with 443 additions and 208 deletions

View File

@ -45,21 +45,7 @@ public class ChatbotController {
@RequestBody ChatbotSessionCreateRequest request) {
ChatbotSession session = chatbotService.createSession(request);
ChatbotSettings settings = featureProperties.current();
ChatbotSessionResponse response =
ChatbotSessionResponse.builder()
.sessionId(session.getSessionId())
.documentId(session.getDocumentId())
.alphaWarning(settings.alphaWarning())
.ocrRequested(session.isOcrRequested())
.imageContentDetected(session.isImageContentDetected())
.textCharacters(session.getTextCharacters())
.estimatedTokens(session.getEstimatedTokens())
.maxCachedCharacters(cacheService.getMaxDocumentCharacters())
.createdAt(session.getCreatedAt())
.warnings(sessionWarnings(settings, session))
.metadata(session.getMetadata())
.usageSummary(session.getUsageSummary())
.build();
ChatbotSessionResponse response = toResponse(session, settings);
return ResponseEntity.status(HttpStatus.CREATED).body(response);
}
@ -76,24 +62,21 @@ public class ChatbotController {
sessionRegistry
.findById(sessionId)
.orElseThrow(() -> new ChatbotException("Session not found"));
ChatbotSessionResponse response =
ChatbotSessionResponse.builder()
.sessionId(session.getSessionId())
.documentId(session.getDocumentId())
.alphaWarning(settings.alphaWarning())
.ocrRequested(session.isOcrRequested())
.imageContentDetected(session.isImageContentDetected())
.textCharacters(session.getTextCharacters())
.estimatedTokens(session.getEstimatedTokens())
.maxCachedCharacters(cacheService.getMaxDocumentCharacters())
.createdAt(session.getCreatedAt())
.warnings(sessionWarnings(settings, session))
.metadata(session.getMetadata())
.usageSummary(session.getUsageSummary())
.build();
ChatbotSessionResponse response = toResponse(session, settings);
return ResponseEntity.ok(response);
}
/**
 * Returns the chatbot session that is registered for the given document.
 *
 * @param documentId document identifier taken from the request path
 * @return 200 OK carrying the session rendered through {@code toResponse}
 * @throws ChatbotException when the registry holds no session for the document
 */
@GetMapping("/document/{documentId}")
public ResponseEntity<ChatbotSessionResponse> getSessionByDocument(
        @PathVariable String documentId) {
    // Settings are read before the lookup, matching the other session endpoints.
    ChatbotSettings settings = featureProperties.current();
    ChatbotSession session =
            sessionRegistry
                    .findByDocumentId(documentId)
                    .orElseThrow(() -> new ChatbotException("Session not found"));
    ChatbotSessionResponse body = toResponse(session, settings);
    return ResponseEntity.ok(body);
}
@DeleteMapping("/session/{sessionId}")
public ResponseEntity<Void> closeSession(@PathVariable String sessionId) {
chatbotService.close(sessionId);
@ -123,4 +106,21 @@ public class ChatbotController {
return warnings;
}
/**
 * Builds the API response for a session. Shared by the create-session, get-session and
 * get-session-by-document endpoints so all three return the same shape.
 *
 * @param session session whose identifiers, counters, metadata and usage summary are copied
 * @param settings current chatbot settings; supplies the alpha-warning flag and is passed to
 *     {@code sessionWarnings}
 * @return the populated response DTO
 */
private ChatbotSessionResponse toResponse(ChatbotSession session, ChatbotSettings settings) {
return ChatbotSessionResponse.builder()
.sessionId(session.getSessionId())
.documentId(session.getDocumentId())
.alphaWarning(settings.alphaWarning())
.ocrRequested(session.isOcrRequested())
.imageContentDetected(session.isImageContentDetected())
.textCharacters(session.getTextCharacters())
.estimatedTokens(session.getEstimatedTokens())
// Not session state: the limit comes from the cache service.
.maxCachedCharacters(cacheService.getMaxDocumentCharacters())
.createdAt(session.getCreatedAt())
// Warnings are derived from settings + session rather than stored on the session.
.warnings(sessionWarnings(settings, session))
.metadata(session.getMetadata())
.usageSummary(session.getUsageSummary())
.build();
}
}

View File

@ -12,9 +12,13 @@ import stirling.software.proprietary.model.chatbot.ChatbotSession;
public class ChatbotSessionRegistry {
private final Map<String, ChatbotSession> sessionStore = new ConcurrentHashMap<>();
private final Map<String, String> documentToSession = new ConcurrentHashMap<>();
/**
 * Stores a session under its session id and, when it carries a document id, indexes it by
 * document as well.
 *
 * @param session session to register
 */
public void register(ChatbotSession session) {
    final String sessionId = session.getSessionId();
    sessionStore.put(sessionId, session);

    final String documentId = session.getDocumentId();
    if (documentId == null) {
        // Sessions without a document are reachable by session id only.
        return;
    }
    documentToSession.put(documentId, sessionId);
}
public Optional<ChatbotSession> findById(String sessionId) {
@ -22,6 +26,16 @@ public class ChatbotSessionRegistry {
}
/**
 * Removes a session and its document index entry.
 *
 * <p>The session is removed from the store exactly once; removing it a second time would
 * always yield {@code null} and leave the document index entry behind. The index entry is
 * dropped only if it still points at this session, so a newer session registered for the same
 * document keeps its mapping.
 *
 * @param sessionId id of the session to remove; unknown ids are ignored
 */
public void remove(String sessionId) {
    ChatbotSession removed = sessionStore.remove(sessionId);
    if (removed == null) {
        return;
    }
    String documentId = removed.getDocumentId();
    if (documentId != null) {
        // Two-argument remove: delete the entry only when it still maps to this session.
        documentToSession.remove(documentId, sessionId);
    }
}
/**
 * Resolves the session registered for a document.
 *
 * @param documentId document identifier used as the index key
 * @return the session, or empty when the document is not indexed or its session is gone
 */
public Optional<ChatbotSession> findByDocumentId(String documentId) {
    String sessionId = documentToSession.get(documentId);
    if (sessionId == null) {
        return Optional.empty();
    }
    return findById(sessionId);
}
/**
 * Drops the index entry for a document and, if one existed, the session it pointed at.
 *
 * @param documentId document identifier used as the index key
 */
public void removeByDocumentId(String documentId) {
    String sessionId = documentToSession.remove(documentId);
    if (sessionId != null) {
        sessionStore.remove(sessionId);
    }
}
}

View File

@ -5145,14 +5145,15 @@
"title": "Stirling PDF Bot",
"alphaBadge": "Alpha",
"alphaTitle": "Experimental feature",
"alphaDescription": "Chatbot is currently in alpha and is subject to change. Responses may be imperfect; please check responses.",
"alphaDescription": "This chatbot is in alpha. It currently ignores images and may produce inaccurate answers.",
"acceptAlphaLabel": "I understand this feature is experimental and image content is not supported yet.",
"fileLabel": "Document to query",
"filePlaceholder": "Select an uploaded PDF",
"noFiles": "Upload a PDF from File Manager to start chatting.",
"ocrToggle": "Run OCR before extracting text (uses more resources)",
"ocrHint": "Enable when your PDF is a scan or contains images.",
"refreshButton": "Re-sync document",
"startButton": "Send document to chat",
"refreshButton": "Reprocess document",
"startButton": "Prepare document for chat",
"sessionSummary": "Context summary",
"contextDetails": "{{pages}} pages · {{chars}} characters synced",
"conversationTitle": "Conversation",
@ -5172,6 +5173,10 @@
"toolHint": "The chat window slides in from the left. If it is already open, this button simply focuses it and passes along the currently selected PDF.",
"toolTitleMenu": "Chatbot (Alpha)",
"toolMenuDescription": "Chat with Stirling Bot about the contents of your PDF.",
"errors": {
"preprocessing": "Unable to prepare this document.",
"unsupported": "Unsupported document type."
},
"status": {
"runningOcr": "Running OCR and extracting text…",
"extracting": "Extracting text from PDF…",

View File

@ -18,17 +18,14 @@ import { useTranslation } from 'react-i18next';
import SmartToyRoundedIcon from '@mui/icons-material/SmartToyRounded';
import WarningAmberRoundedIcon from '@mui/icons-material/WarningAmberRounded';
import SendRoundedIcon from '@mui/icons-material/SendRounded';
import CloseRoundedIcon from '@mui/icons-material/CloseRounded';
import RefreshRoundedIcon from '@mui/icons-material/RefreshRounded';
import { useChatbot } from '@app/contexts/ChatbotContext';
import { useFileState } from '@app/contexts/FileContext';
import { extractTextFromPdf } from '@app/services/pdfTextExtractor';
import { runOcrForChat } from '@app/services/chatbotOcrService';
import {
ChatbotMessageResponse,
ChatbotSessionInfo,
ChatbotUsageSummary,
createChatbotSession,
sendChatbotPrompt,
} from '@app/services/chatbotService';
import { useToast } from '@app/components/toast';
@ -52,28 +49,32 @@ function createMessageId() {
}
const MAX_PROMPT_CHARS = 4000;
const ALPHA_ACK_KEY = 'stirling.chatbot.alphaAck';
const ChatbotDrawer = () => {
const { t } = useTranslation();
const isMobile = useMediaQuery('(max-width: 768px)');
const { width: viewportWidth, height: viewportHeight } = useViewportSize();
const { isOpen, closeChat, preferredFileId, setPreferredFileId } = useChatbot();
const {
isOpen,
closeChat,
preferredFileId,
setPreferredFileId,
sessions: preparedSessions,
requestPreprocessing,
} = useChatbot();
const { selectors } = useFileState();
const { sidebarRefs } = useSidebarContext();
const { show } = useToast();
const files = selectors.getFiles();
const [selectedFileId, setSelectedFileId] = useState<string | undefined>();
const [alphaAccepted, setAlphaAccepted] = useState(false);
const [runOcr, setRunOcr] = useState(false);
const [isStartingSession, setIsStartingSession] = useState(false);
const [isSendingMessage, setIsSendingMessage] = useState(false);
const [statusMessage, setStatusMessage] = useState<string>('');
const [sessionInfo, setSessionInfo] = useState<ChatbotSessionInfo | null>(null);
const [contextStats, setContextStats] = useState<{ pageCount: number; characterCount: number } | null>(null);
const [messages, setMessages] = useState<ChatMessage[]>([]);
const [prompt, setPrompt] = useState('');
const [warnings, setWarnings] = useState<string[]>([]);
const [noTextModalOpen, setNoTextModalOpen] = useState(false);
const [pendingOcrRetry, setPendingOcrRetry] = useState(false);
const scrollViewportRef = useRef<HTMLDivElement>(null);
const [panelAnchor, setPanelAnchor] = useState<{ right: number; top: number } | null>(null);
const usageAlertState = useRef<'none' | 'warned' | 'limit'>('none');
@ -82,6 +83,50 @@ const ChatbotDrawer = () => {
() => files.find((file) => file.fileId === selectedFileId),
[files, selectedFileId]
);
// Preparation state for the currently selected file, as published by ChatbotContext.
const selectedSessionEntry = selectedFileId
  ? preparedSessions[selectedFileId]
  : undefined;
const sessionStatus = selectedSessionEntry?.status ?? 'idle';
const sessionError = selectedSessionEntry?.error;
const sessionInfo: ChatbotSessionInfo | null = selectedSessionEntry?.session ?? null;
// Context stats are only shown once the document is ready and a character count is known.
const contextStats =
  selectedSessionEntry?.status === 'ready' && selectedSessionEntry?.characterCount !== undefined
    ? {
        pageCount: selectedSessionEntry.pageCount ?? 0,
        characterCount: selectedSessionEntry.characterCount ?? 0,
      }
    : null;
// Memoised so the `[]` fallback keeps a stable identity between renders; a fresh array on
// every render would invalidate any hook that lists preparationWarnings as a dependency.
const preparationWarnings = useMemo(
  () => selectedSessionEntry?.warnings ?? [],
  [selectedSessionEntry?.warnings]
);
// Status line shown above the conversation: alpha prompt first, then progress, then any
// preparation failure. Null means there is nothing to show.
const derivedStatusMessage = useMemo(() => {
  if (!alphaAccepted) {
    return t('chatbot.autoSyncPrompt', 'Acknowledge the alpha notice to start syncing automatically.');
  }

  const isPreparing = sessionStatus === 'processing' || isStartingSession;
  if (isPreparing) {
    return t('chatbot.status.syncing', 'Preparing document for chat…');
  }

  switch (sessionStatus) {
    case 'error':
      // Prefer the specific error recorded for the document over the generic text.
      return sessionError || t('chatbot.errors.preprocessing', 'Unable to prepare this document.');
    case 'unsupported':
      return sessionError || t('chatbot.errors.unsupported', 'Unsupported document type.');
    default:
      return null;
  }
}, [alphaAccepted, sessionStatus, sessionError, isStartingSession, t]);
// Warnings shown to the user: document-preparation warnings first, then the non-empty
// warnings collected in local state.
const assistantWarnings = useMemo(() => {
  const combined = [...preparationWarnings];
  combined.push(...warnings.filter(Boolean));
  return combined;
}, [preparationWarnings, warnings]);
// Restore the persisted alpha acknowledgement each time the drawer is opened.
useEffect(() => {
  if (!isOpen) {
    return;
  }
  let storedAck = false;
  try {
    storedAck =
      typeof window !== 'undefined' &&
      window.localStorage.getItem(ALPHA_ACK_KEY) === 'true';
  } catch (error) {
    // Storage access can throw (e.g. when the browser blocks site data). Treat that as
    // "not acknowledged" instead of letting the effect crash the drawer.
    console.warn('[Chatbot] Unable to read alpha acknowledgement', error);
  }
  setAlphaAccepted(storedAck);
}, [isOpen]);
useEffect(() => {
if (!isOpen) {
@ -113,8 +158,16 @@ const ChatbotDrawer = () => {
// When the active session changes, reset the usage alert level and evaluate the new
// session's usage summary.
useEffect(() => {
  usageAlertState.current = 'none';
  if (!sessionInfo) {
    return;
  }
  maybeShowUsageWarning(sessionInfo.usageSummary);
}, [sessionInfo?.sessionId]);
// Clear the conversation and locally collected warnings whenever a different file is selected.
useEffect(() => {
setMessages([]);
setWarnings([]);
}, [selectedFileId]);
const maybeShowUsageWarning = (usage?: ChatbotUsageSummary | null) => {
if (!usage) {
return;
@ -144,17 +197,6 @@ const ChatbotDrawer = () => {
}
};
useEffect(() => {
if (sessionInfo && sessionInfo.documentId !== selectedFileId) {
setSessionInfo(null);
setContextStats(null);
setMessages([]);
setWarnings([]);
setPendingOcrRetry(false);
setNoTextModalOpen(false);
}
}, [sessionInfo, selectedFileId]);
useLayoutEffect(() => {
if (isMobile || !isOpen) {
setPanelAnchor(null);
@ -183,15 +225,6 @@ const ChatbotDrawer = () => {
};
}, [isMobile, isOpen, sidebarRefs.toolPanelRef]);
const withStatus = async <T,>(label: string, fn: () => Promise<T>): Promise<T> => {
setStatusMessage(label);
try {
return await fn();
} finally {
setStatusMessage('');
}
};
const ensureFileSelected = () => {
if (!selectedFile) {
show({
@ -204,65 +237,27 @@ const ChatbotDrawer = () => {
return true;
};
const handleSessionStart = async (forceOcr?: boolean) => {
if (!ensureFileSelected() || !selectedFile) {
/**
 * Records the user's answer to the alpha notice and persists it in localStorage so it
 * survives reloads. Unchecking removes the stored flag.
 */
const handleAlphaAccept = (checked: boolean) => {
  setAlphaAccepted(checked);
  if (typeof window === 'undefined') {
    return;
  }
  try {
    if (checked) {
      window.localStorage.setItem(ALPHA_ACK_KEY, 'true');
    } else {
      window.localStorage.removeItem(ALPHA_ACK_KEY);
    }
  } catch (error) {
    // Persisting is best effort: storage may be blocked or full. The in-memory state set
    // above still applies for this session.
    console.warn('[Chatbot] Unable to persist alpha acknowledgement', error);
  }
};
const handleManualPrepare = async (forceOcr?: boolean) => {
if (!ensureFileSelected() || !selectedFileId) {
return;
}
setIsStartingSession(true);
try {
let workingFile: File = selectedFile;
const shouldRunOcr = forceOcr ?? runOcr;
const extractionResult = await withStatus(
shouldRunOcr
? t('chatbot.status.runningOcr', 'Running OCR and extracting text…')
: t('chatbot.status.extracting', 'Extracting text from PDF…'),
async () => {
if (shouldRunOcr) {
workingFile = await runOcrForChat(selectedFile);
}
return extractTextFromPdf(workingFile);
}
);
if (!extractionResult.text || extractionResult.text.trim().length === 0) {
setPendingOcrRetry(true);
setNoTextModalOpen(true);
return;
}
const metadata = {
name: workingFile.name,
size: String(workingFile.size),
pageCount: String(extractionResult.pageCount),
};
const sessionPayload = {
sessionId: sessionInfo?.sessionId,
documentId: selectedFile.fileId,
text: extractionResult.text,
metadata,
ocrRequested: shouldRunOcr,
warningsAccepted: true,
};
const response = await withStatus(
t('chatbot.status.syncing', 'Syncing document with Stirling Bot…'),
() => createChatbotSession(sessionPayload)
);
setSessionInfo(response);
maybeShowUsageWarning(response.usageSummary);
setContextStats({
pageCount: extractionResult.pageCount,
characterCount: extractionResult.characterCount,
});
setMessages([]);
setWarnings(response.warnings ?? []);
setPendingOcrRetry(false);
setNoTextModalOpen(false);
await requestPreprocessing(selectedFileId, { force: true, forceOcr: forceOcr ?? runOcr });
usageAlertState.current = 'none';
} catch (error) {
console.error('[Chatbot] Failed to start session', error);
console.error('[Chatbot] Failed to prepare document', error);
show({
alertType: 'error',
title: t('chatbot.toasts.failedSessionTitle', 'Could not prepare document'),
@ -270,42 +265,11 @@ const ChatbotDrawer = () => {
});
} finally {
setIsStartingSession(false);
setStatusMessage('');
}
};
useEffect(() => {
if (
!isOpen ||
!selectedFile ||
sessionInfo ||
isStartingSession ||
pendingOcrRetry ||
noTextModalOpen
) {
return;
}
let cancelled = false;
handleSessionStart().catch((error) => {
if (!cancelled) {
console.error('[Chatbot] Auto-sync failed', error);
}
});
return () => {
cancelled = true;
};
}, [isOpen, selectedFile, sessionInfo, isStartingSession, pendingOcrRetry, noTextModalOpen, runOcr]);
useEffect(() => {
if (!sessionInfo) {
return;
}
setSessionInfo(null);
setContextStats(null);
}, [runOcr]);
const handleSendMessage = async () => {
if (!sessionInfo) {
if (!sessionInfo || sessionStatus !== 'ready') {
show({
alertType: 'neutral',
title: t('chatbot.toasts.noSessionTitle', 'Sync your document first'),
@ -369,7 +333,8 @@ const ChatbotDrawer = () => {
[files]
);
const disablePromptInput = !sessionInfo || isStartingSession || isSendingMessage;
const disablePromptInput =
!sessionInfo || sessionStatus !== 'ready' || isStartingSession || isSendingMessage;
const canSend = !disablePromptInput && prompt.trim().length > 0;
const handlePromptKeyDown = (event: KeyboardEvent<HTMLTextAreaElement>) => {
@ -395,7 +360,6 @@ const ChatbotDrawer = () => {
</Group>
);
const assistantWarnings = warnings.filter(Boolean);
const safeViewportWidth =
viewportWidth || (typeof window !== 'undefined' ? window.innerWidth : 1280);
@ -497,6 +461,28 @@ const ChatbotDrawer = () => {
transitionProps={{ transition: 'slide-left', duration: 200 }}
>
<Stack gap="sm" h="100%" style={{ minHeight: 0 }}>
<Box
p="sm"
style={{
border: '1px solid var(--border-subtle)',
borderRadius: 8,
backgroundColor: 'var(--bg-subtle)',
display: 'flex',
gap: '0.5rem',
alignItems: 'flex-start',
}}
>
<WarningAmberRoundedIcon fontSize="small" style={{ color: 'var(--text-warning)' }} />
<Box>
<Text fw={600}>{t('chatbot.alphaTitle', 'Experimental feature')}</Text>
<Text size="sm">
{t(
'chatbot.alphaDescription',
'This chatbot is in alpha. It currently ignores images and may produce inaccurate answers.'
)}
</Text>
</Box>
</Box>
<Group align="flex-end" justify="space-between" gap="md" wrap="wrap">
<Select
@ -508,7 +494,12 @@ const ChatbotDrawer = () => {
nothingFoundMessage={t('chatbot.noFiles', 'Upload a PDF from File Manager to start chatting.')}
style={{ flex: '1 1 200px' }}
/>
<Stack gap={4} style={{ minWidth: 160 }}>
<Stack gap={4} style={{ minWidth: 180 }}>
<Switch
checked={alphaAccepted}
onChange={(event) => handleAlphaAccept(event.currentTarget.checked)}
label={t('chatbot.acceptAlphaLabel', 'I acknowledge this experimental feature')}
/>
<Switch
checked={runOcr}
onChange={(event) => setRunOcr(event.currentTarget.checked)}
@ -517,7 +508,20 @@ const ChatbotDrawer = () => {
</Stack>
</Group>
{statusMessage && (
<Button
fullWidth
variant="filled"
leftSection={<RefreshRoundedIcon fontSize="small" />}
loading={isStartingSession || sessionStatus === 'processing'}
onClick={() => handleManualPrepare()}
disabled={!selectedFile || !alphaAccepted || sessionStatus === 'processing'}
>
{sessionStatus === 'ready'
? t('chatbot.refreshButton', 'Reprocess document')
: t('chatbot.startButton', 'Prepare document for chat')}
</Button>
{derivedStatusMessage && (
<Box
p="sm"
style={{
@ -526,7 +530,16 @@ const ChatbotDrawer = () => {
backgroundColor: 'var(--bg-muted)',
}}
>
<Text size="sm" c="blue">{statusMessage}</Text>
<Text
size="sm"
c={
sessionStatus === 'error' || sessionStatus === 'unsupported'
? 'var(--text-warning)'
: 'blue'
}
>
{derivedStatusMessage}
</Text>
</Box>
)}
@ -561,28 +574,6 @@ const ChatbotDrawer = () => {
</Group>
</Box>
))}
{isOpen && (
<Box
p="sm"
bg="var(--bg-muted)"
style={{ borderRadius: 12, border: '1px solid var(--border-subtle)' }}
>
<Group gap="xs" align="flex-start">
<WarningAmberRoundedIcon fontSize="small" style={{ color: 'var(--text-warning)' }} />
<Box>
<Text size="sm" fw={600}>
{t('chatbot.alphaTitle', 'Experimental feature')}
</Text>
<Text size="sm">
{t(
'chatbot.alphaDescription',
'This chatbot is in alpha. It currently ignores images and may produce inaccurate answers.'
)}
</Text>
</Box>
</Group>
</Box>
)}
{messages.length === 0 && (
<Text size="sm" c="dimmed">
{t('chatbot.emptyState', 'Ask a question about your PDF to start the conversation.')}
@ -634,35 +625,6 @@ const ChatbotDrawer = () => {
</Stack>
</Modal>
<Modal
opened={noTextModalOpen}
onClose={() => setNoTextModalOpen(false)}
title={t('chatbot.noTextTitle', 'No text detected in this PDF')}
centered
>
<Stack gap="sm">
<Text size="sm">
{t('chatbot.noTextBody', 'We could not find selectable text in this document. Would you like to run OCR to convert scanned pages into text?')}
</Text>
<Group justify="flex-end">
<Button variant="default" leftSection={<CloseRoundedIcon fontSize="small" />} onClick={() => setNoTextModalOpen(false)}>
{t('chatbot.noTextDismiss', 'Maybe later')}
</Button>
<Button
leftSection={<SmartToyRoundedIcon fontSize="small" />}
onClick={() => {
setNoTextModalOpen(false);
setRunOcr(true);
if (pendingOcrRetry) {
handleSessionStart(true);
}
}}
>
{t('chatbot.noTextRunOcr', 'Run OCR and retry')}
</Button>
</Group>
</Stack>
</Modal>
</>
);
};

View File

@ -1,4 +1,23 @@
import { createContext, useCallback, useContext, useMemo, useState, type ReactNode } from 'react';
import {
createContext,
useCallback,
useContext,
useEffect,
useMemo,
useRef,
useState,
type ReactNode,
} from 'react';
import { useFileState } from '@app/contexts/FileContext';
import type { StirlingFile } from '@app/types/fileContext';
import { extractTextFromPdf } from '@app/services/pdfTextExtractor';
import { extractTextFromDocx } from '@app/services/docxTextExtractor';
import {
ChatbotSessionInfo,
createChatbotSession,
} from '@app/services/chatbotService';
import { runOcrForChat } from '@app/services/chatbotOcrService';
type ChatbotSource = 'viewer' | 'tool';
@ -7,6 +26,25 @@ interface OpenChatOptions {
fileId?: string;
}
/**
 * Lifecycle of a document being prepared for chat: `idle` is the default for a new entry,
 * `processing` while text is being extracted and the session created, `ready` once the
 * session exists, `error` when preparation failed, `unsupported` for file types that are
 * not prepared.
 */
type PreparationStatus = 'idle' | 'processing' | 'ready' | 'error' | 'unsupported';
/** Per-file preparation record kept by the chatbot provider, keyed by file id. */
interface PreparedChatbotDocument {
// documentId and fileId are both filled from the file's fileId by the provider.
documentId: string;
fileId: string;
fileName: string;
status: PreparationStatus;
// Session returned by createChatbotSession; set when status is 'ready'.
session?: ChatbotSessionInfo;
// Size figures reported by the text extractor.
characterCount?: number;
pageCount?: number;
// Warnings returned with the session.
warnings?: string[];
// Human-readable reason for an 'error' or 'unsupported' status.
error?: string;
}
/** Options accepted by requestPreprocessing. */
interface PreprocessOptions {
// Start a new run even if one is already in flight for the file.
force?: boolean;
// Run OCR before extracting text; only honoured for PDF files.
forceOcr?: boolean;
}
interface ChatbotContextValue {
isOpen: boolean;
source: ChatbotSource;
@ -14,6 +52,8 @@ interface ChatbotContextValue {
openChat: (options?: OpenChatOptions) => void;
closeChat: () => void;
setPreferredFileId: (fileId?: string) => void;
sessions: Record<string, PreparedChatbotDocument>;
requestPreprocessing: (fileId: string, options?: PreprocessOptions) => Promise<void>;
}
const ChatbotContext = createContext<ChatbotContextValue | undefined>(undefined);
@ -23,6 +63,184 @@ export function ChatbotProvider({ children }: { children: ReactNode }) {
const [source, setSource] = useState<ChatbotSource>('viewer');
const [preferredFileId, setPreferredFileId] = useState<string | undefined>();
const { selectors } = useFileState();
// Preparation records keyed by file id; exposed to consumers as `sessions`.
const [preparedSessions, setPreparedSessions] = useState<
Record<string, PreparedChatbotDocument>
>({});
// Mirror of the latest state, refreshed on every render, so effects can read it without
// listing preparedSessions as a dependency.
const sessionsRef = useRef(preparedSessions);
sessionsRef.current = preparedSessions;
// Preparation promises currently running, keyed by file id; used to de-duplicate requests.
const inFlightRef = useRef<Map<string, Promise<void>>>(new Map());
// Extensions the provider reacts to. 'doc' is listed so legacy Word files reach
// preprocessFile, which then marks them as unsupported with a specific message.
const supportedExtensions = useMemo(
() => new Set(['pdf', 'doc', 'docx']),
[]
);
// Lower-cased text after the last dot of the file name, or '' when the name has no dot.
const getExtension = useCallback((file: StirlingFile) => {
  const lastDot = file.name.lastIndexOf('.');
  if (lastDot < 0) {
    return '';
  }
  return file.name.slice(lastDot + 1).toLowerCase();
}, []);
/**
 * Merges `partial` into the preparation record for `file`, creating the record if needed.
 * Identity fields are always refreshed from the file. The status defaults to 'idle' only
 * for a brand-new record: an update that omits `status` keeps the status already stored
 * instead of resetting it.
 */
const updateSessionEntry = useCallback((file: StirlingFile, partial: Partial<PreparedChatbotDocument>) => {
  setPreparedSessions((prev) => {
    const existing = prev[file.fileId];
    return {
      ...prev,
      [file.fileId]: {
        ...existing,
        documentId: file.fileId,
        fileId: file.fileId,
        fileName: file.name,
        status: existing?.status ?? 'idle',
        ...partial,
      },
    };
  });
}, []);
/**
 * Prepares one file for chat: extracts its text (optionally after OCR), creates a chatbot
 * session with that text, and records the outcome in the file's preparation entry.
 *
 * Unsupported extensions and legacy `.doc` files are recorded as 'unsupported' and the
 * function returns without throwing. Any failure during extraction or session creation is
 * recorded as 'error' and then rethrown to the caller.
 */
const preprocessFile = useCallback(
async (file: StirlingFile, options?: PreprocessOptions) => {
const extension = getExtension(file);
if (!supportedExtensions.has(extension)) {
updateSessionEntry(file, {
status: 'unsupported',
error: 'Only PDF and Word documents are indexed for chat.',
});
return;
}
// '.doc' passes the extension check above but has no extractor.
if (extension === 'doc') {
updateSessionEntry(file, {
status: 'unsupported',
error: 'Legacy Word (.doc) files are not supported yet.',
});
return;
}
// Clear everything left from a previous run before starting.
updateSessionEntry(file, {
status: 'processing',
error: undefined,
session: undefined,
warnings: undefined,
characterCount: undefined,
pageCount: undefined,
});
try {
let workingFile: File = file;
// OCR is only applied to PDFs, and only when explicitly requested.
const shouldRunOcr = Boolean(options?.forceOcr && extension === 'pdf');
if (shouldRunOcr) {
workingFile = await runOcrForChat(file);
}
let extracted: { text: string; pageCount?: number; characterCount: number };
if (extension === 'pdf') {
const pdfResult = await extractTextFromPdf(workingFile);
extracted = {
text: pdfResult.text,
pageCount: pdfResult.pageCount,
characterCount: pdfResult.characterCount,
};
} else {
// The docx extractor reports no page count; 0 is recorded in its place.
const docxResult = await extractTextFromDocx(workingFile);
extracted = {
text: docxResult.text,
pageCount: 0,
characterCount: docxResult.characterCount,
};
}
// Empty or whitespace-only text is treated as a failure (handled by the catch below).
if (!extracted.text || extracted.text.trim().length === 0) {
throw new Error(
'No text detected. Try running OCR from the chat window.'
);
}
const metadata: Record<string, string> = {
fileName: workingFile.name,
fileSize: String(workingFile.size),
fileType: workingFile.type || extension,
characterCount: String(extracted.characterCount),
ocrApplied: shouldRunOcr ? 'true' : 'false',
};
if (typeof extracted.pageCount === 'number') {
metadata.pageCount = String(extracted.pageCount);
}
// The file id doubles as both session id and document id.
const session = await createChatbotSession({
sessionId: file.fileId,
documentId: file.fileId,
text: extracted.text,
metadata,
ocrRequested: shouldRunOcr,
warningsAccepted: true,
});
updateSessionEntry(file, {
status: 'ready',
session,
characterCount: extracted.characterCount,
pageCount: extracted.pageCount,
warnings: session.warnings ?? [],
error: undefined,
});
} catch (error) {
const message =
error instanceof Error
? error.message
: 'Failed to prepare document for chatbot.';
updateSessionEntry(file, {
status: 'error',
error: message,
});
// Rethrow so callers awaiting the preparation can react as well.
throw error;
}
},
[getExtension, supportedExtensions, updateSessionEntry]
);
/**
 * Starts (or joins) preparation of the file with the given id.
 *
 * - Unknown file ids resolve immediately without doing anything.
 * - When a run is already in flight for the file, its promise is returned unless
 *   `options.force` is set, in which case a new run is started and tracked instead.
 *
 * The returned promise rejects when preparation fails.
 */
const requestPreprocessing = useCallback(
  async (fileId: string, options?: PreprocessOptions) => {
    const file = selectors.getFile(fileId as any);
    if (!file) {
      return;
    }

    const inFlight = inFlightRef.current;
    const existing = inFlight.get(fileId);
    if (existing && !options?.force) {
      return existing;
    }

    const promise: Promise<void> = preprocessFile(file, options).finally(() => {
      // Only clear the slot if it still belongs to this run. A forced request may have
      // replaced it in the meantime, and that newer run must stay tracked.
      if (inFlight.get(fileId) === promise) {
        inFlight.delete(fileId);
      }
    });
    inFlight.set(fileId, promise);
    return promise;
  },
  [selectors, preprocessFile]
);
// The file list is recomputed only when the signature (or the selectors object) changes.
// NOTE(review): presumably the signature changes whenever files are added or removed — confirm
// against FileContext.
const filesSignature = selectors.getFilesSignature();
const availableFiles = useMemo(
() => selectors.getFiles(),
[filesSignature, selectors]
);
// Keep preparation records in step with the file list: start preparing files that have no
// record yet, and drop records whose file is no longer present.
useEffect(() => {
  availableFiles.forEach((file) => {
    if (!supportedExtensions.has(getExtension(file))) {
      return;
    }
    if (!sessionsRef.current[file.fileId]) {
      // Failures are already recorded on the file's entry by preprocessFile, so the
      // rejection is intentionally ignored here.
      requestPreprocessing(file.fileId).catch(() => {});
    }
  });

  const currentIds = new Set(availableFiles.map((file) => file.fileId));
  setPreparedSessions((prev) => {
    const staleIds = Object.keys(prev).filter((fileId) => !currentIds.has(fileId as any));
    if (staleIds.length === 0) {
      // Returning the same object lets React skip the state update and the re-render.
      return prev;
    }
    const next = { ...prev };
    staleIds.forEach((fileId) => {
      delete next[fileId];
    });
    return next;
  });
}, [availableFiles, getExtension, requestPreprocessing, supportedExtensions]);
const openChat = useCallback((options: OpenChatOptions = {}) => {
if (options.source) {
setSource(options.source);
@ -45,8 +263,10 @@ export function ChatbotProvider({ children }: { children: ReactNode }) {
openChat,
closeChat,
setPreferredFileId,
sessions: preparedSessions,
requestPreprocessing,
}),
[isOpen, source, preferredFileId, openChat, closeChat]
[isOpen, source, preferredFileId, openChat, closeChat, preparedSessions, requestPreprocessing]
);
return <ChatbotContext.Provider value={value}>{children}</ChatbotContext.Provider>;

View File

@ -0,0 +1,34 @@
import JSZip from 'jszip';
/** Result of extracting plain text from a .docx file. */
export interface ExtractedDocxText {
// Paragraph texts joined with newlines.
text: string;
// Length of `text`.
characterCount: number;
}
/**
 * Extracts plain text from a .docx file.
 *
 * The archive's main document part is parsed as XML and the text of every paragraph
 * element is collected, whitespace-normalised, and joined with newlines.
 *
 * Each paragraph contributes once: the namespace-wildcard lookup and the `w:p` lookup
 * normally return the same elements, so the two results are merged by identity, and
 * paragraphs nested inside another collected paragraph are skipped because the outer
 * paragraph's text content already contains their text.
 *
 * @param file the .docx file to read
 * @returns the extracted text and its length
 * @throws Error when the main document part is missing or is not well-formed XML
 */
export async function extractTextFromDocx(file: File): Promise<ExtractedDocxText> {
  const zip = await JSZip.loadAsync(file);
  const documentXml =
    (await zip.file('word/document.xml')?.async('string')) ??
    (await zip.file('word/document2.xml')?.async('string'));
  if (!documentXml) {
    throw new Error('Docx document.xml missing');
  }

  const xml = new DOMParser().parseFromString(documentXml, 'application/xml');
  // DOMParser reports malformed XML through a <parsererror> element instead of throwing.
  if (xml.getElementsByTagName('parsererror').length > 0) {
    throw new Error('Docx document.xml could not be parsed');
  }

  // A Set keeps insertion (document) order while dropping elements matched by both lookups.
  const paragraphSet = new Set<Element>([
    ...Array.from(xml.getElementsByTagNameNS('*', 'p')),
    ...Array.from(xml.getElementsByTagName('w:p')),
  ]);

  const hasParagraphAncestor = (node: Element): boolean => {
    for (let parent = node.parentElement; parent; parent = parent.parentElement) {
      if (paragraphSet.has(parent)) {
        return true;
      }
    }
    return false;
  };

  const text = Array.from(paragraphSet)
    .filter((p) => !hasParagraphAncestor(p))
    .map((p) => (p.textContent || '').replace(/\s+/g, ' ').trim())
    .filter(Boolean)
    .join('\n')
    .trim();

  return {
    text,
    characterCount: text.length,
  };
}