From ed366059147b878615fc80625d75229c47b8047c Mon Sep 17 00:00:00 2001 From: Siddhant Thakuria Date: Tue, 14 Oct 2025 16:55:15 +0530 Subject: [PATCH] deprecate cpu support --- .gitignore | 4 +- CLAUDE.md | 14 +------ README.md | 35 ++++------------ docstrange/cli.py | 24 +++-------- docstrange/extractor.py | 44 ++++++------------- docstrange/pipeline/model_downloader.py | 2 +- docstrange/static/script.js | 56 +++---------------------- docstrange/utils/gpu_utils.py | 10 ++++- docstrange/web_app.py | 35 ++++++++++++---- 9 files changed, 73 insertions(+), 151 deletions(-) diff --git a/.gitignore b/.gitignore index ed36a94..c216b9c 100644 --- a/.gitignore +++ b/.gitignore @@ -199,4 +199,6 @@ ENV/ .vscode/ .playwright-mcp/ -examples/ \ No newline at end of file +examples/ + +venv1/ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index 3f0912f..e6cbe36 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,9 +6,8 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co DocStrange is a Python library for extracting and converting documents (PDFs, Word, Excel, PowerPoint, images, URLs) into multiple formats (Markdown, JSON, CSV, HTML) with intelligent content extraction and advanced OCR capabilities. 
-The library offers three processing modes: +The library offers two processing modes: - **Cloud Mode (default)**: Instant conversion using cloud API -- **CPU Mode**: Local processing for privacy - **GPU Mode**: Local processing with GPU acceleration ## Commands @@ -117,16 +116,10 @@ python -m twine upload dist/* - Authentication: `docstrange login` or API key - Best for: Quick processing without GPU -### CPU Mode -- Force with `cpu=True` parameter -- Uses local neural models -- Requires model downloads (~500MB first run) -- Best for: Privacy-sensitive documents - ### GPU Mode - Force with `gpu=True` parameter - Requires CUDA-compatible GPU -- Faster than CPU for large documents +- Fastest local processing - Best for: Batch processing, high-volume workloads ## Authentication & Rate Limits @@ -215,9 +208,6 @@ structured = result.extract_data(json_schema=schema) ### Force local processing ```python -# CPU mode -extractor = DocumentExtractor(cpu=True) - # GPU mode (requires CUDA) extractor = DocumentExtractor(gpu=True) ``` diff --git a/README.md b/README.md index 980269a..c6ab2b6 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ DocStrange converts documents to Markdown, JSON, CSV, and HTML quickly and accur > Extract documents data instantly with the cloud processing - no complex setup needed > **🔒 Local Processing !** -> Use `cpu` or `gpu` mode for 100% local processing - no data sent anywhere, everything stays on your machine. +> Use `gpu` mode for 100% local processing - no data sent anywhere, everything stays on your machine. ## **What's New** @@ -56,7 +56,7 @@ Convert and extract data from PDF, DOCX, images, and more into clean Markdown an `DocStrange` is a Python library for converting a wide range of document formats—including **PDF**, **DOCX**, **PPTX**, **XLSX**, and **images** — into clean, usable data. 
It produces LLM-optimized **Markdown**, structured **JSON** (with schema support), **HTML**, and **CSV** outputs, making it an ideal tool for preparing content for RAG pipelines and other AI applications. -The library offers both a powerful cloud API and a 100% private, offline mode that runs locally on your CPU or GPU. Developed by **Nanonets**, DocStrange is built on a powerful pipeline of OCR and layout detection models and currently requires **Python >=3.8**. +The library offers both a powerful cloud API and a 100% private, offline mode that runs locally on your GPU. Developed by **Nanonets**, DocStrange is built on a powerful pipeline of OCR and layout detection models and currently requires **Python >=3.8**. **To report a bug or request a feature, [please file an issue](https://github.com/NanoNets/docstrange/issues). To ask a question or request assistance, please use the [discussions forum](https://github.com/NanoNets/docstrange/discussions).** @@ -185,12 +185,9 @@ print(structured_data) **Local Processing** -For complete privacy and offline capability, run DocStrange entirely on your own machine. You can specify whether to use your CPU or GPU for processing. +For complete privacy and offline capability, run DocStrange entirely on your own machine using GPU processing. ```python -# Force local CPU processing -extractor = DocumentExtractor(cpu=True) - # Force local GPU processing (requires CUDA) extractor = DocumentExtractor(gpu=True) ``` @@ -201,7 +198,7 @@ extractor = DocumentExtractor(gpu=True) 💡 Want a GUI? Run the simple, drag-and-drop local web interface for private, offline document conversion. -For users who prefer a graphical interface, DocStrange includes a powerful, self-hosted web UI. This allows for easy drag-and-drop conversion of PDF, DOCX, and other files directly in your browser, with 100% private, offline processing on your own CPU or GPU. The interface automatically downloads required models on its first run. 
+For users who prefer a graphical interface, DocStrange includes a powerful, self-hosted web UI. This allows for easy drag-and-drop conversion of PDF, DOCX, and other files directly in your browser, with 100% private, offline processing on your own GPU. The interface automatically downloads required models on its first run. ### How to get started? @@ -230,9 +227,9 @@ python -c "from docstrange.web_app import run_web_app; run_web_app()" - 🖱️ Drag & Drop Interface: Simply drag files onto the upload area. - 📁 Multiple File Types: Supports PDF, DOCX, XLSX, PPTX, images, and more. -- ⚙️ Processing Modes: Choose between Local CPU and Local GPU processing. +- ⚙️ Processing Modes: Choose between Cloud and Local GPU processing. - 📊 Multiple Output Formats: Get Markdown, HTML, JSON, CSV, and Flat JSON. -- 🔒 100% Local Processing: No data leaves your machine. +- 🔒 Privacy Options: Choose between cloud processing (default) or local GPU processing. - 📱 Responsive Design: Works on desktop, tablet, and mobile ### **Supported File Types:** @@ -245,9 +242,8 @@ python -c "from docstrange.web_app import run_web_app; run_web_app()" ### **Processing Modes:** -- **Local CPU**: Works offline, slower but private (default) +- **Cloud processing:** For instant, zero-setup conversion, you can head over to [docstrange.nanonets.com](http://docstrange.nanonets.com/) **—** no setup (default) - **Local GPU**: Fastest local processing, requires CUDA support -- **Cloud processing:** For instant, zero-setup conversion, you can head over to [docstrange.nanonets.com](http://docstrange.nanonets.com/) **—** no setup ### **Output Formats:** @@ -295,7 +291,7 @@ docstrange web --port 8001 - The interface automatically detects GPU availability - GPU option will be disabled if CUDA is not available -- CPU mode will be selected automatically +- Error will be thrown 3. 
Model Download Issues: @@ -367,13 +363,6 @@ csv_data = result.extract_csv() print(csv_data) ``` -**Requirements for enhanced JSON (if using cpu=True):** - -- Install: `pip install 'docstrange[local-llm]'` -- [Install Ollama](https://ollama.ai/) and run: `ollama serve` -- Pull a model: `ollama pull llama3.2` - -*If Ollama is not available, the library automatically falls back to the standard JSON parser.* **c. Extract Specific Fields & Structured Data** @@ -484,11 +473,6 @@ contract_schema = { contract_data = contract.extract_data(json_schema=contract_schema) ``` -**Local extraction requirements (if using cpu=True):** - -- Install ollama package: `pip install 'docstrange[local-llm]'` -- [Install Ollama](https://ollama.ai/) and run: `ollama serve` -- Pull a model: `ollama pull llama3.2` **e. Chain with LLM** @@ -591,7 +575,6 @@ docstrange document.pdf docstrange document.pdf --api-key YOUR_API_KEY # Local processing modes -docstrange document.pdf --cpu-mode docstrange document.pdf --gpu-mode # Different output formats @@ -629,8 +612,6 @@ docstrange document.pdf --output json --extract-fields title author date summary # Or use API key for 10k docs/month access (alternative to login) docstrange document.pdf --api-key YOUR_API_KEY --output json --extract-fields title author date summary -# Force local processing with field extraction (requires Ollama) -docstrange document.pdf --cpu-mode --output json --extract-fields key_points conclusions recommendations ``` **Example schema.json file:** diff --git a/docstrange/cli.py b/docstrange/cli.py index 5780224..dff3271 100644 --- a/docstrange/cli.py +++ b/docstrange/cli.py @@ -182,9 +182,6 @@ def main(): # Convert with free API key with increased limits docstrange document.pdf --api-key YOUR_API_KEY - # Force local CPU processing - docstrange document.pdf --cpu-mode - # Force local GPU processing docstrange document.pdf --gpu-mode @@ -207,10 +204,10 @@ def main(): # Convert multiple files docstrange file1.pdf file2.docx 
file3.xlsx --output markdown - # Extract specific fields using Ollama (CPU mode only) or cloud + # Extract specific fields using cloud processing docstrange invoice.pdf --output json --extract-fields invoice_number total_amount vendor_name - # Extract using JSON schema (Ollama for CPU mode, cloud for default mode) + # Extract using JSON schema with cloud processing docstrange document.pdf --output json --json-schema schema.json # Save output to file @@ -242,12 +239,6 @@ def main(): ) # Processing mode arguments - parser.add_argument( - "--cpu-mode", - action="store_true", - help="Force local CPU-only processing (disables cloud mode)" - ) - parser.add_argument( "--gpu-mode", action="store_true", @@ -280,12 +271,12 @@ def main(): parser.add_argument( "--extract-fields", nargs="+", - help="Extract specific fields using Ollama (CPU mode) or cloud (default mode) (e.g., --extract-fields invoice_number total_amount)" + help="Extract specific fields using cloud processing (e.g., --extract-fields invoice_number total_amount)" ) parser.add_argument( "--json-schema", - help="JSON schema file for structured extraction using Ollama (CPU mode) or cloud (default mode)" + help="JSON schema file for structured extraction using cloud processing" ) parser.add_argument( @@ -361,7 +352,6 @@ def main(): extractor = DocumentExtractor( api_key=args.api_key, model=args.model, - cpu=args.cpu_mode, gpu=args.gpu_mode ) print_supported_formats(extractor) @@ -404,12 +394,11 @@ def main(): extractor = DocumentExtractor( api_key=args.api_key, model=args.model, - cpu=args.cpu_mode, gpu=args.gpu_mode ) if args.verbose: - mode = "local" if (args.cpu_mode or args.gpu_mode) else "cloud" + mode = "local" if args.gpu_mode else "cloud" print(f"Initialized extractor in {mode} mode:") print(f" - Output format: {args.output}") if mode == "cloud": @@ -418,8 +407,7 @@ def main(): if args.model: print(f" - Model: {args.model}") else: - processor_type = "GPU" if args.gpu_mode else "CPU" - print(f" - Local 
processing: {processor_type}") + print(f" - Local processing: GPU") print() # Process inputs diff --git a/docstrange/extractor.py b/docstrange/extractor.py index 1587ade..6823d82 100644 --- a/docstrange/extractor.py +++ b/docstrange/extractor.py @@ -34,7 +34,6 @@ def __init__( ocr_enabled: bool = True, api_key: Optional[str] = None, model: Optional[str] = None, - cpu: bool = False, gpu: bool = False ): """Initialize the file extractor. @@ -45,11 +44,10 @@ def __init__( ocr_enabled: Whether to enable OCR for image and PDF processing api_key: API key for cloud processing (optional). Prefer 'docstrange login' for 10k docs/month; API key from https://app.nanonets.com/#/keys is an alternative model: Model to use for cloud processing (gemini, openapi) - only for cloud mode - cpu: Force local CPU-only processing (disables cloud mode) gpu: Force local GPU processing (disables cloud mode, requires GPU) Note: - - Cloud mode is the default unless cpu or gpu is specified + - Cloud mode is the default unless gpu is specified - Without login or API key, limited calls per day - For 10k docs/month, run 'docstrange login' (recommended) or use an API key from https://app.nanonets.com/#/keys """ @@ -57,23 +55,17 @@ def __init__( self.include_images = include_images self.api_key = api_key self.model = model - self.cpu = cpu self.gpu = gpu # Determine processing mode - # Cloud mode is default unless CPU/GPU preference is explicitly set - self.cloud_mode = not (self.cpu or self.gpu) - - # Validate CPU/GPU preferences - if self.cpu and self.gpu: - raise ValueError("Cannot specify both cpu and gpu. Choose one or neither.") + # Cloud mode is default unless GPU preference is explicitly set + self.cloud_mode = not self.gpu # Check GPU availability if GPU preference is set if self.gpu and not should_use_gpu_processor(): raise RuntimeError( "GPU preference specified but no GPU is available. 
" - "Please ensure CUDA is installed and a compatible GPU is present, " - "or use cpu=True for CPU-only processing." + "Please ensure CUDA is installed and a compatible GPU is present." ) # Default to True if not explicitly set @@ -157,7 +149,7 @@ def authenticate(self, force_reauth: bool = False) -> bool: return False def _setup_local_processors(self): - """Setup local processors based on CPU/GPU preferences.""" + """Setup local processors based on GPU preferences.""" local_processors = [ PDFProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled), DOCXProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images), @@ -169,19 +161,11 @@ def _setup_local_processors(self): URLProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images), ] - # Add GPU processor based on preferences and availability - gpu_available = should_use_gpu_processor() - - if self.cpu: - logger.info("CPU preference specified - using CPU-based processors only") - elif self.gpu: - if gpu_available: - logger.info("GPU preference specified - adding GPU processor with Nanonets OCR") - gpu_processor = GPUProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled) - local_processors.append(gpu_processor) - else: - # This should not happen due to validation in __init__, but just in case - raise RuntimeError("GPU preference specified but no GPU is available") + # Add GPU processor if GPU preference is specified + if self.gpu: + logger.info("GPU preference specified - adding GPU processor with Nanonets OCR") + gpu_processor = GPUProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled) + local_processors.append(gpu_processor) self.processors.extend(local_processors) @@ -312,14 +296,12 @@ def get_processing_mode(self) -> str: """ if self.cloud_mode and self.api_key: return "cloud" - elif self.cpu: 
- return "cpu_forced" elif self.gpu: return "gpu_forced" elif should_use_gpu_processor(): return "gpu_auto" else: - return "cpu_auto" + return "cloud" def _get_processor(self, file_path: str): """Get the appropriate processor for the file. @@ -340,7 +322,7 @@ def _get_processor(self, file_path: str): gpu_available = should_use_gpu_processor() # Try GPU processor only if format is supported AND (gpu OR auto-gpu) - if not self.cpu and ext in gpu_supported_formats and (self.gpu or (gpu_available and not self.gpu)): + if ext in gpu_supported_formats and (self.gpu or (gpu_available and not self.gpu)): for processor in self.processors: if isinstance(processor, GPUProcessor): if self.gpu: @@ -349,7 +331,7 @@ def _get_processor(self, file_path: str): logger.info(f"Using GPU processor with Nanonets OCR for {file_path} (GPU available and format supported)") return processor - # Fallback to normal processor selection (CPU processors) + # Fallback to normal processor selection for processor in self.processors: if processor.can_process(file_path): # Skip GPU processor in fallback mode to avoid infinite loops diff --git a/docstrange/pipeline/model_downloader.py b/docstrange/pipeline/model_downloader.py index af07292..0ec14d4 100644 --- a/docstrange/pipeline/model_downloader.py +++ b/docstrange/pipeline/model_downloader.py @@ -76,7 +76,7 @@ def download_models(self, force: bool = False, progress: bool = True) -> Path: if gpu_available: logger.info("GPU detected - including Nanonets OCR model") else: - logger.info("No GPU detected - skipping Nanonets OCR model (CPU-only mode)") + logger.info("No GPU detected - skipping Nanonets OCR model (cloud mode)") models_to_download = [ ("Layout Model", self.LAYOUT_MODEL), diff --git a/docstrange/static/script.js b/docstrange/static/script.js index 7a2eb18..5d8404d 100644 --- a/docstrange/static/script.js +++ b/docstrange/static/script.js @@ -25,53 +25,8 @@ class DocStrangeApp { } updateProcessingModeOptions(systemInfo) { - const gpuOption = 
document.querySelector('input[name="processingMode"][value="gpu"]'); - const gpuLabel = gpuOption?.closest('.radio-option'); - const gpuWarning = document.getElementById('gpuWarning'); - - if (gpuLabel) { - if (!systemInfo.gpu_available) { - // Show warning message - if (gpuWarning) { - gpuWarning.style.display = 'block'; - } - - // Disable GPU option if not available - gpuOption.disabled = true; - gpuLabel.classList.add('disabled'); - - // Update description - const description = gpuLabel.querySelector('.radio-description'); - if (description) { - description.textContent = systemInfo.processing_modes.gpu.description; - description.style.color = '#D02B2B'; - } - - // If GPU was selected, switch to CPU - if (gpuOption.checked) { - const cpuOption = document.querySelector('input[name="processingMode"][value="cpu"]'); - if (cpuOption) { - cpuOption.checked = true; - } - } - } else { - // Hide warning message - if (gpuWarning) { - gpuWarning.style.display = 'none'; - } - - // Enable GPU option if available - gpuOption.disabled = false; - gpuLabel.classList.remove('disabled'); - - // Update description - const description = gpuLabel.querySelector('.radio-description'); - if (description) { - description.textContent = systemInfo.processing_modes.gpu.description; - description.style.color = ''; - } - } - } + // Processing mode is now handled automatically - cloud by default, GPU if available and selected + // No UI changes needed as processing mode selection has been removed } initializeEventListeners() { @@ -211,9 +166,8 @@ class DocStrangeApp { const outputFormat = document.querySelector('input[name="outputFormat"]:checked').value; formData.append('output_format', outputFormat); - // Get selected processing mode - const processingMode = document.querySelector('input[name="processingMode"]:checked').value; - formData.append('processing_mode', processingMode); + // Use cloud processing mode by default + formData.append('processing_mode', 'cloud'); const response = await 
fetch('/api/extract', { method: 'POST', @@ -229,7 +183,7 @@ class DocStrangeApp { // Handle specific GPU errors if (errorMessage.includes('GPU') && errorMessage.includes('not available')) { - this.showError('GPU mode is not available. Please select CPU mode instead.'); + this.showError('GPU mode is not available. Please install PyTorch with CUDA support or use cloud processing.'); } else { this.showError(errorMessage); } diff --git a/docstrange/utils/gpu_utils.py b/docstrange/utils/gpu_utils.py index 97f8cda..d3e9c42 100644 --- a/docstrange/utils/gpu_utils.py +++ b/docstrange/utils/gpu_utils.py @@ -71,9 +71,15 @@ def get_processor_preference() -> str: """Get the preferred processor type based on system capabilities. Returns: - 'gpu' if GPU is available, 'cpu' otherwise + 'gpu' if GPU is available + + Raises: + RuntimeError: If GPU is not available """ if should_use_gpu_processor(): return 'gpu' else: - return 'cpu' \ No newline at end of file + raise RuntimeError( + "GPU is not available. Please ensure CUDA is installed and a compatible GPU is present, " + "or use cloud processing mode." 
+ ) \ No newline at end of file diff --git a/docstrange/web_app.py b/docstrange/web_app.py index ed04a97..2d42b51 100644 --- a/docstrange/web_app.py +++ b/docstrange/web_app.py @@ -35,9 +35,9 @@ def download_models(): # Download GPU models extractor = DocumentExtractor(gpu=True) else: - print("πŸ’» GPU not available - downloading CPU models only") - # Download CPU models only - extractor = DocumentExtractor(cpu=True) + print("πŸ’» GPU not available - using cloud processing") + # Use cloud processing when GPU is not available + extractor = DocumentExtractor() # Test extraction to trigger model downloads print("πŸ“₯ Downloading models...") @@ -64,10 +64,10 @@ def create_extractor_with_mode(processing_mode): """Create DocumentExtractor with proper error handling for processing mode.""" if processing_mode == 'gpu': if not check_gpu_availability(): - raise ValueError("GPU mode selected but GPU is not available. Please install PyTorch with CUDA support or use CPU mode.") + raise ValueError("GPU mode selected but GPU is not available. Please install PyTorch with CUDA support.") return DocumentExtractor(gpu=True) - else: # cpu mode (default) - return DocumentExtractor(cpu=True) + else: # cloud mode (default) + return DocumentExtractor() # Initialize the document extractor extractor = DocumentExtractor() @@ -182,9 +182,9 @@ def get_system_info(): system_info = { 'gpu_available': gpu_available, 'processing_modes': { - 'cpu': { + 'cloud': { 'available': True, - 'description': 'Process locally using CPU. Works offline, slower but private.' + 'description': 'Process using cloud API. Fast and requires no local setup.' 
}, 'gpu': { 'available': gpu_available, @@ -197,6 +197,25 @@ def get_system_info(): def run_web_app(host='0.0.0.0', port=8000, debug=False): """Run the web application.""" + # Check GPU availability before starting the server + print("πŸ” Checking GPU availability...") + gpu_available = check_gpu_availability() + + if not gpu_available: + error_msg = ( + "❌ GPU is not available! DocStrange requires GPU for optimal performance.\n" + "Please ensure:\n" + "1. CUDA is installed on your system\n" + "2. PyTorch with CUDA support is installed\n" + "3. A compatible NVIDIA GPU is present\n\n" + "To install PyTorch with CUDA support, run:\n" + "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n\n" + "Alternatively, you can use cloud processing mode by modifying the configuration." + ) + print(error_msg) + raise RuntimeError("GPU is not available. DocStrange requires GPU for optimal performance.") + + print("βœ… GPU detected - proceeding with model download...") print("πŸ”„ Downloading models before starting the web interface...") download_models() print(f"βœ… Starting docstrange web interface at http://{host}:{port}")