🚀 Our new and improved config V3 is now live! See API reference for details.
import requests
# Example: submit a document to the Reducto parse endpoint and check the
# HTTP status before the response body is used.
url = "https://platform.reducto.ai/parse"

# Request body. "<string>" is a placeholder: replace it with the URL of the
# document to be processed. The remaining values are the sample's settings.
payload = {
    "document_url": "<string>",
    "options": {
        "ocr_mode": "standard",
        "extraction_mode": "ocr",
        "chunking": {"chunk_mode": "variable"},
        "table_summary": {"enabled": False},
        "figure_summary": {
            "enabled": False,
            "enhanced": False,
            "override": False,
        },
        "filter_blocks": [],
        "force_url_result": False,
    },
    "advanced_options": {
        "ocr_system": "highres",
        "table_output_format": "html",
        "merge_tables": False,
        "include_formula_information": False,
        "include_color_information": False,
        "continue_hierarchy": True,
        "keep_line_breaks": False,
        "page_range": {},
        "large_table_chunking": {
            "enabled": True,
            "size": 50,
        },
        "spreadsheet_table_clustering": "default",
        "add_page_markers": False,
        "remove_text_formatting": False,
        "return_ocr_data": False,
        "filter_line_numbers": False,
        "read_comments": False,
        "persist_results": False,
        "exclude_hidden_sheets": False,
        "exclude_hidden_rows_cols": False,
        "enable_change_tracking": False,
        "enable_highlight_detection": False,
    },
    "experimental_options": {
        "enrich": {
            "enabled": False,
            "mode": "standard",
        },
        "layout_enrichment": False,
        "native_office_conversion": False,
        "enable_checkboxes": False,
        "enable_equations": False,
        "rotate_pages": True,
        "rotate_figures": False,
        "enable_scripts": False,
        "return_figure_images": False,
        "return_table_images": False,
        "layout_model": "default",
        "embed_text_metadata_pdf": False,
        "detect_signatures": False,
        "danger_filter_wide_boxes": False,
    },
    "priority": True,
}

# "<token>" is a placeholder for your auth token.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json",
}

# Without a timeout, requests waits indefinitely for the server. The tuple is
# (connect, read) in seconds; the values are illustrative, so tune the read
# timeout to how long your documents take to parse.
response = requests.post(url, json=payload, headers=headers, timeout=(10, 300))

# Raise requests.HTTPError on a 4xx/5xx status so an error response is not
# mistaken for a parse result.
response.raise_for_status()
print(response.json())
{
"job_id": "<string>",
"duration": 123,
"usage": {
"num_pages": 123,
"credits": 123
},
"result": {
"type": "<string>",
"chunks": [
{
"content": "<string>",
"embed": "<string>",
"enriched": "<string>",
"blocks": [
{
"type": "Header",
"bbox": {
"left": 123,
"top": 123,
"width": 123,
"height": 123,
"page": 123,
"original_page": 123
},
"content": "<string>",
"image_url": "<string>",
"confidence": "low",
"granular_confidence": {
"extract_confidence": 123,
"parse_confidence": 123
}
}
],
"enrichment_success": false
}
],
"ocr": {
"words": [
{
"text": "<string>",
"bbox": {
"left": 123,
"top": 123,
"width": 123,
"height": 123,
"page": 123,
"original_page": 123
},
"confidence": 123,
"chunk_index": 123
}
],
"lines": [
{
"text": "<string>",
"bbox": {
"left": 123,
"top": 123,
"width": 123,
"height": 123,
"page": 123,
"original_page": 123
},
"confidence": 123,
"chunk_index": 123
}
]
},
"custom": "<unknown>"
},
"pdf_url": "<string>",
"studio_link": "<string>"
}
import requests
# Example: submit a document to the Reducto parse endpoint and check the
# HTTP status before the response body is used.
url = "https://platform.reducto.ai/parse"

# Request body. "<string>" is a placeholder: replace it with the URL of the
# document to be processed. The remaining values are the sample's settings.
payload = {
    "document_url": "<string>",
    "options": {
        "ocr_mode": "standard",
        "extraction_mode": "ocr",
        "chunking": {"chunk_mode": "variable"},
        "table_summary": {"enabled": False},
        "figure_summary": {
            "enabled": False,
            "enhanced": False,
            "override": False,
        },
        "filter_blocks": [],
        "force_url_result": False,
    },
    "advanced_options": {
        "ocr_system": "highres",
        "table_output_format": "html",
        "merge_tables": False,
        "include_formula_information": False,
        "include_color_information": False,
        "continue_hierarchy": True,
        "keep_line_breaks": False,
        "page_range": {},
        "large_table_chunking": {
            "enabled": True,
            "size": 50,
        },
        "spreadsheet_table_clustering": "default",
        "add_page_markers": False,
        "remove_text_formatting": False,
        "return_ocr_data": False,
        "filter_line_numbers": False,
        "read_comments": False,
        "persist_results": False,
        "exclude_hidden_sheets": False,
        "exclude_hidden_rows_cols": False,
        "enable_change_tracking": False,
        "enable_highlight_detection": False,
    },
    "experimental_options": {
        "enrich": {
            "enabled": False,
            "mode": "standard",
        },
        "layout_enrichment": False,
        "native_office_conversion": False,
        "enable_checkboxes": False,
        "enable_equations": False,
        "rotate_pages": True,
        "rotate_figures": False,
        "enable_scripts": False,
        "return_figure_images": False,
        "return_table_images": False,
        "layout_model": "default",
        "embed_text_metadata_pdf": False,
        "detect_signatures": False,
        "danger_filter_wide_boxes": False,
    },
    "priority": True,
}

# "<token>" is a placeholder for your auth token.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json",
}

# Without a timeout, requests waits indefinitely for the server. The tuple is
# (connect, read) in seconds; the values are illustrative, so tune the read
# timeout to how long your documents take to parse.
response = requests.post(url, json=payload, headers=headers, timeout=(10, 300))

# Raise requests.HTTPError on a 4xx/5xx status so an error response is not
# mistaken for a parse result.
response.raise_for_status()
print(response.json())
{
"job_id": "<string>",
"duration": 123,
"usage": {
"num_pages": 123,
"credits": 123
},
"result": {
"type": "<string>",
"chunks": [
{
"content": "<string>",
"embed": "<string>",
"enriched": "<string>",
"blocks": [
{
"type": "Header",
"bbox": {
"left": 123,
"top": 123,
"width": 123,
"height": 123,
"page": 123,
"original_page": 123
},
"content": "<string>",
"image_url": "<string>",
"confidence": "low",
"granular_confidence": {
"extract_confidence": 123,
"parse_confidence": 123
}
}
],
"enrichment_success": false
}
],
"ocr": {
"words": [
{
"text": "<string>",
"bbox": {
"left": 123,
"top": 123,
"width": 123,
"height": 123,
"page": 123,
"original_page": 123
},
"confidence": 123,
"chunk_index": 123
}
],
"lines": [
{
"text": "<string>",
"bbox": {
"left": 123,
"top": 123,
"width": 123,
"height": 123,
"page": 123,
"original_page": 123
},
"confidence": 123,
"chunk_index": 123
}
]
},
"custom": "<unknown>"
},
"pdf_url": "<string>",
"studio_link": "<string>"
}
Bearer authentication header of the form `Bearer <token>`, where `<token>` is your auth token.
The URL of the document to be processed. You can provide one of the following:
Show child attributes
The mode to use for OCR. Agentic mode adds an extra pass, correcting any table/text mistakes at a small cost.
Available options: standard, agentic
The mode to use for extraction. Metadata/hybrid are only recommended with high quality metadata embeddings.
Available options: ocr, metadata, hybrid
The configuration options for chunking. Chunking is commonly used for RAG use cases.
Show child attributes
Choose how to partition chunks. Variable mode chunks by character length and visual context. Section mode chunks by section headers. Page mode chunks according to pages. Page sections mode chunks first by page, then by sections within each page. Disabled returns one single chunk.
Available options: variable, section, page, block, disabled, page_sections
The approximate size of chunks (in characters) that the document will be split into. Defaults to None, in which case the chunk size is variable between 250 - 1500 characters.
The configuration options for figure summarization.
Show child attributes
If figure summarization should be performed.
Add information to the prompt for figure summarization. Note any visual cues that should be incorporated. Example: 'When provided a diagram, extract all of the figure content verbatim.'
If the figure summary prompt should override our default prompt.
A list of block types to filter from chunk content. Pass blocks to filter them from content. By default, no blocks are filtered.
Available options: Header, Footer, Title, Section Header, Page Number, List Item, Figure, Table, Key Value, Text, Comment, Signature
Force the result to be returned in URL form (by default only used for very large responses).
Show child attributes
The OCR system to use. Highres is recommended for documents with English characters. Legacy uses an alternative OCR backend.
Available options: highres, multilingual, combined, reducto, legacy
The mode to use for table output. Dynamic returns md for simpler tables and html for more complex tables.
Available options: html, json, md, jsonbbox, dynamic, ai_json, csv
A flag to indicate if consecutive tables with the same number of columns should be merged across breaks and spaces.
If True, preserve formula information in spreadsheet cells by wrapping text with LaTeX formula commands during parsing.
If True, preserve Excel cell colours in the extracted spreadsheet text using LaTeX colour commands.
A flag to indicate if the hierarchy of the document should be continued from chunk to chunk.
If line breaks should be preserved in the text.
The page range to process (1-indexed). By default, the entire document is processed.
Force the URL to be downloaded as a specific file extension (e.g. .png).
The configuration options for large table chunking (currently only supported on spreadsheet and CSV files).
Show child attributes
If large tables should be chunked into smaller tables, currently only supported on spreadsheet and CSV files.
The max row/column size for a table to be chunked. Defaults to 50. Header rows/columns are persisted based on heuristics.
In a spreadsheet with different tables inside, we enable splitting up the tables by default. Intelligent mode applies more powerful models for superior accuracy, at 5× the default per-cell rate. Disabling will register as one large table.
Available options: default, disabled, intelligent
If True, add page markers to the output (e.g. [[PAGE 1 BEGINS HERE]] and [[PAGE 1 ENDS HERE]] added as blocks to the content). Defaults to False.
If True, remove text formatting from the output (e.g. hyphens for list items). Defaults to False.
If True, return OCR data in the result. Defaults to False.
Password to decrypt password-protected documents.
If True, filter out line numbers from the output. Defaults to False.
If True, pull in PDF comments from the document. Defaults to False.
If True, persist the results indefinitely. Defaults to False.
Skip hidden sheets in Excel files. Defaults to False.
Skip hidden rows and cols in Excel files. Defaults to False.
Enables model-based detection of underlines and strikethroughs, adding <u>/<s> tags to OCR text. Works with any extraction mode. Defaults to False.
If True, enable highlight detection. Highlighted text will be surrounded by <mark> tags in the output. Defaults to False.
Show child attributes
The configuration options for enrichment.
Show child attributes
If enabled, a large language/vision model will be used to postprocess the extracted content. Note: enabling enrich requires tables be outputted in markdown format. Defaults to False.
The mode to use for enrichment. Defaults to standard
Available options: standard, page, table
Add information to the prompt for enrichment.
Layout enrichment is a beta feature that improves our layout and reading order performance at the cost of increased latency. Defaults to False.
Instead of using LibreOffice, when enabled, this flag uses a Windows VM to convert files. This is slower but more accurate.
Use an experimental checkbox detection model to add checkboxes to the output, defaults to False
Use an experimental equation detection model to add equations to the output, defaults to False
Use an orientation model to detect and rotate pages as needed, defaults to True
Use an orientation model to detect and rotate figures as needed, defaults to False
Add <sub> tag around subscripts and <sup> tag around superscripts, defaults to False
If figure images should be returned in the result. Defaults to False.
If table images should be returned in the result. Defaults to False.
The layout model to use for the document. This will be deprecated in the future.
Available options: default, beta
If extracted OCR text metadata should be embedded back into the returned PDF, overwriting any existing text. Defaults to False.
If True, detect signatures in the document. Defaults to False.
You probably shouldn't use this. If True, filter out boxes with width greater than 50% of the document width. Defaults to False. You probably don't want to use this.
A user specified timeout, defaults to None
If True, attempts to process the job with priority if the user has priority processing budget available; by default, sync jobs are prioritized above async jobs.
Successful Response
The duration of the parse request in seconds.
The response from the document processing service. Note that there can be two types of responses, Full Result and URL Result. This is due to limitations on the max return size on HTTPS. If the response is too large, it will be returned as a presigned URL in the URL response. You should handle this in your application.
Show child attributes
type = 'full'
"full"
Show child attributes
The content of the chunk extracted from the document.
Chunk content optimized for embedding and retrieval.
The enriched content of the chunk extracted from the document.
Show child attributes
The type of block extracted from the document.
Available options: Header, Footer, Title, Section Header, Page Number, List Item, Figure, Table, Key Value, Text, Comment, Signature
The bounding box of the block extracted from the document.
Show child attributes
The page number of the bounding box (1-indexed).
The page number in the original document of the bounding box (1-indexed).
The content of the block extracted from the document.
(Experimental) The URL of the image associated with the block.
The confidence for the block. It is either low or high and takes into account factors like OCR and table structure
Granular confidence scores for the block. It is a dictionary of confidence scores for the block. The confidence scores will not be None if the user has enabled numeric confidence scores.
Whether the enrichment was successful.
Show child attributes
Show child attributes
Show child attributes
The page number of the bounding box (1-indexed).
The page number in the original document of the bounding box (1-indexed).
OCR confidence score between 0 and 1, where 1 indicates highest confidence
The index of the chunk that the word belongs to.
Show child attributes
Show child attributes
The page number of the bounding box (1-indexed).
The page number in the original document of the bounding box (1-indexed).
OCR confidence score between 0 and 1, where 1 indicates highest confidence
The index of the chunk that the line belongs to.
The storage URL of the converted PDF file.
The link to the studio pipeline for the document.
Was this page helpful?