parse.run() — API Reference
The parse.run() method converts documents into structured JSON with text, tables, and figures. It runs OCR, detects document layout, and returns content organized into chunks optimized for LLM and RAG workflows.

Basic Usage

from pathlib import Path
from reducto import Reducto

client = Reducto()

# Upload and parse
upload = client.upload(file=Path("invoice.pdf"))
result = client.parse.run(input=upload.file_id)

# Access the results
for chunk in result.result.chunks:
    print(chunk.content)
    for block in chunk.blocks:
        print(f"  {block.type} on page {block.bbox.page}")

Method Signature

client.parse.run(
    input: str | list[str],
    enhance: dict | None = None,
    formatting: dict | None = None,
    retrieval: dict | None = None,
    settings: dict | None = None,
    spreadsheet: dict | None = None,
) -> ParseResponse

Parameters

| Parameter   | Type              | Required | Description                                            |
| ----------- | ----------------- | -------- | ------------------------------------------------------ |
| input       | str \| list[str]  | Yes      | File ID (reducto://...), URL, or jobid:// reference    |
| enhance     | dict \| None      | No       | Enhancement options (agentic mode, figure summaries)   |
| formatting  | dict \| None      | No       | Output formatting (table formats, metadata)            |
| retrieval   | dict \| None      | No       | Chunking and filtering options                         |
| settings    | dict \| None      | No       | Processing settings (page range, OCR, timeouts)        |
| spreadsheet | dict \| None      | No       | Spreadsheet-specific options                           |

Input Options

The input parameter accepts several formats:
# From upload
result = client.parse.run(input=upload.file_id)

# Public URL
result = client.parse.run(input="https://example.com/doc.pdf")

# Presigned S3 URL
result = client.parse.run(input="https://bucket.s3.amazonaws.com/doc.pdf?X-Amz-...")

# Reprocess previous job
result = client.parse.run(input="jobid://7600c8c5-a52f-49d2-8a7d-d75d1b51e141")

Configuration Examples

Chunking

By default, Parse returns the entire document as one chunk. For RAG applications, use variable chunking:
result = client.parse.run(
    input=upload.file_id,
    retrieval={
        "chunking": {
            "chunk_mode": "variable"  # Options: "disabled", "variable", "page", "section"
        }
    }
)

Table Output Format

Control how tables appear in the output:
result = client.parse.run(
    input=upload.file_id,
    formatting={
        "table_output_format": "html"  # Options: "dynamic", "html", "md", "json", "csv"
    }
)

Agentic Mode

Use LLM to review and correct parsing output:
result = client.parse.run(
    input=upload.file_id,
    enhance={
        "agentic": [
            {"scope": "text"},      # For OCR correction
            {"scope": "table"},     # For table structure fixes
            {"scope": "figure"}     # For chart extraction
        ]
    }
)

Figure Summaries

Generate descriptions for charts and images:
result = client.parse.run(
    input=upload.file_id,
    enhance={
        "summarize_figures": True
    }
)

Page Range

Process only specific pages:
result = client.parse.run(
    input=upload.file_id,
    settings={
        "page_range": {
            "start": 1,
            "end": 10
        }
    }
)

Filter Blocks

Remove specific content types from output:
result = client.parse.run(
    input=upload.file_id,
    retrieval={
        "filter_blocks": ["Header", "Footer", "Page Number"]
    }
)

Return Images

Get image URLs for figures and tables:
result = client.parse.run(
    input=upload.file_id,
    settings={
        "return_images": ["figure", "table"]
    }
)

# Access images from blocks
for chunk in result.result.chunks:
    for block in chunk.blocks:
        if hasattr(block, 'image_url') and block.image_url:
            print(f"{block.type}: {block.image_url}")

Response Structure

The ParseResponse object contains:
result: ParseResponse = client.parse.run(input=upload.file_id)

# Top-level fields
print(result.job_id)          # str: Unique job identifier
print(result.duration)        # float: Processing time in seconds
print(result.studio_link)     # str: Link to view in Studio

# Usage information
print(result.usage.num_pages)  # int: Pages processed
print(result.usage.credits)    # float: Credits consumed

# Result content
if result.result.type == "full":
    chunks = result.result.chunks
    for chunk in chunks:
        print(chunk.content)    # str: Full text content
        print(chunk.embed)      # str: Embedding-optimized content
        print(chunk.blocks)    # list[Block]: Individual elements

Chunks

Each chunk contains:
  • content (str): Full text content formatted as Markdown
  • embed (str): Content optimized for embeddings
  • blocks (list[Block]): Individual elements with positions

Blocks

Each block contains:
  • type (str): Element type (Title, Header, Text, Table, Figure, etc.)
  • content (str): The block's content
  • bbox (BoundingBox): Position on the page (normalized 0-1 coordinates)
  • confidence (str): Confidence level ("high" or "low")

URL Results

For large documents, the response may return a URL instead of inline content:
result = client.parse.run(input=upload.file_id)

if result.result.type == "url":
    # Fetch the content from the URL
    import requests
    chunks = requests.get(result.result.url).json()
else:
    # Content is inline
    chunks = result.result.chunks
To always get a URL (for consistent handling):
result = client.parse.run(
    input=upload.file_id,
    settings={
        "force_url_result": True
    }
)

Advanced Features

Raw Response Access

Access raw HTTP response data:
response = client.parse.with_raw_response.run(input=upload.file_id)
print(response.headers.get('X-My-Header'))
parse_result = response.parse()  # Get the parsed object

Streaming Response

Stream large responses:
with client.parse.with_streaming_response.run(input=upload.file_id) as response:
    for line in response.iter_lines():
        print(line)

Per-Request Options

Override client settings for this request:
# Custom timeout
client.with_options(timeout=30.0).parse.run(input=upload.file_id)

# Custom retry settings
client.with_options(max_retries=5).parse.run(input=upload.file_id)

Error Handling

from reducto import Reducto
import reducto

try:
    result = client.parse.run(input=upload.file_id)
except reducto.APIConnectionError as e:
    print(f"Connection failed: {e}")
    print(e.__cause__)  # underlying exception
except reducto.RateLimitError as e:
    print(f"Rate limited: {e}")
except reducto.APIStatusError as e:
    print(f"API error: {e.status_code} - {e.response}")
except reducto.APITimeoutError as e:
    print(f"Request timed out: {e}")

Complete Example

from pathlib import Path
from reducto import Reducto

client = Reducto()

# Upload
upload = client.upload(file=Path("financial-statement.pdf"))

# Parse with configuration
result = client.parse.run(
    input=upload.file_id,
    enhance={
        "agentic": [{"scope": "table"}],
        "summarize_figures": True
    },
    formatting={
        "table_output_format": "html"
    },
    retrieval={
        "chunking": {"chunk_mode": "variable"}
    },
    settings={
        "page_range": {"start": 1, "end": 5}
    }
)

# Process results
print(f"Processed {result.usage.num_pages} pages")
print(f"Used {result.usage.credits} credits")
print(f"View in Studio: {result.studio_link}")

for i, chunk in enumerate(result.result.chunks):
    print(f"\n=== Chunk {i + 1} ===")
    print(chunk.content[:500])  # First 500 chars
    
    # Count block types
    block_types = {}
    for block in chunk.blocks:
        block_types[block.type] = block_types.get(block.type, 0) + 1
    
    print(f"Block types: {block_types}")

Best Practices

Use Variable Chunking for RAG

Enable chunk_mode: "variable" for RAG pipelines to get semantically meaningful chunks.

Enable Agentic for Scanned Docs

Use agentic: [{"scope": "text"}] for scanned documents or poor-quality PDFs.

Filter Headers/Footers

Use filter_blocks to remove headers and footers that pollute search results.

Handle URL Results

Always check result.type and handle URL results for large documents.

Next Steps