🚀 Our new and improved config V3 is now live! See API reference for details.
import requests
# Endpoint that accepts a split job for asynchronous processing.
url = "https://platform.reducto.ai/split_async"

# Request body. Every "<string>" value is a placeholder to replace before running.
payload = {
    "split_description": [
        {
            "name": "<string>",
            "description": "<string>",
            "partition_key": "<string>",
        }
    ],
    "document_url": "<string>",
    "options": {
        "ocr_mode": "standard",
        "extraction_mode": "ocr",
        "chunking": {"chunk_mode": "variable"},
        "table_summary": {"enabled": False},
        "figure_summary": {
            "enabled": False,
            "enhanced": False,
            "override": False,
        },
        "filter_blocks": [],
        "force_url_result": False,
    },
    "advanced_options": {
        "ocr_system": "highres",
        "table_output_format": "html",
        "merge_tables": False,
        "include_formula_information": False,
        "include_color_information": False,
        "continue_hierarchy": True,
        "keep_line_breaks": False,
        "page_range": {},
        "large_table_chunking": {
            "enabled": True,
            "size": 50,
        },
        "spreadsheet_table_clustering": "default",
        "add_page_markers": False,
        "remove_text_formatting": False,
        "return_ocr_data": False,
        "filter_line_numbers": False,
        "read_comments": False,
        "persist_results": False,
        "exclude_hidden_sheets": False,
        "exclude_hidden_rows_cols": False,
        "enable_change_tracking": False,
        "enable_highlight_detection": False,
    },
    "experimental_options": {
        "enrich": {
            "enabled": False,
            "mode": "standard",
        },
        "layout_enrichment": False,
        "native_office_conversion": False,
        "enable_checkboxes": False,
        "enable_equations": False,
        "rotate_pages": True,
        "rotate_figures": False,
        "enable_scripts": False,
        "return_figure_images": False,
        "return_table_images": False,
        "layout_model": "default",
        "embed_text_metadata_pdf": False,
        "detect_signatures": False,
        "danger_filter_wide_boxes": False,
    },
    "split_rules": "Split the document into the applicable sections. Sections may only overlap at their first and last page if at all.",
    "priority": False,
    "split_options": {"table_cutoff": "truncate"},
    "webhook": {
        "mode": "disabled",
        "channels": [],
    },
}

# "<token>" is a placeholder for the caller's auth token.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json",
}

# requests applies no timeout by default; bound the call so a stalled
# connection cannot hang the script indefinitely.
response = requests.post(url, json=payload, headers=headers, timeout=30)
print(response.json())
{
"job_id": "<string>"
}
import requests
# Endpoint that accepts a split job for asynchronous processing.
url = "https://platform.reducto.ai/split_async"

# Request body. Every "<string>" value is a placeholder to replace before running.
payload = {
    "split_description": [
        {
            "name": "<string>",
            "description": "<string>",
            "partition_key": "<string>",
        }
    ],
    "document_url": "<string>",
    "options": {
        "ocr_mode": "standard",
        "extraction_mode": "ocr",
        "chunking": {"chunk_mode": "variable"},
        "table_summary": {"enabled": False},
        "figure_summary": {
            "enabled": False,
            "enhanced": False,
            "override": False,
        },
        "filter_blocks": [],
        "force_url_result": False,
    },
    "advanced_options": {
        "ocr_system": "highres",
        "table_output_format": "html",
        "merge_tables": False,
        "include_formula_information": False,
        "include_color_information": False,
        "continue_hierarchy": True,
        "keep_line_breaks": False,
        "page_range": {},
        "large_table_chunking": {
            "enabled": True,
            "size": 50,
        },
        "spreadsheet_table_clustering": "default",
        "add_page_markers": False,
        "remove_text_formatting": False,
        "return_ocr_data": False,
        "filter_line_numbers": False,
        "read_comments": False,
        "persist_results": False,
        "exclude_hidden_sheets": False,
        "exclude_hidden_rows_cols": False,
        "enable_change_tracking": False,
        "enable_highlight_detection": False,
    },
    "experimental_options": {
        "enrich": {
            "enabled": False,
            "mode": "standard",
        },
        "layout_enrichment": False,
        "native_office_conversion": False,
        "enable_checkboxes": False,
        "enable_equations": False,
        "rotate_pages": True,
        "rotate_figures": False,
        "enable_scripts": False,
        "return_figure_images": False,
        "return_table_images": False,
        "layout_model": "default",
        "embed_text_metadata_pdf": False,
        "detect_signatures": False,
        "danger_filter_wide_boxes": False,
    },
    "split_rules": "Split the document into the applicable sections. Sections may only overlap at their first and last page if at all.",
    "priority": False,
    "split_options": {"table_cutoff": "truncate"},
    "webhook": {
        "mode": "disabled",
        "channels": [],
    },
}

# "<token>" is a placeholder for the caller's auth token.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json",
}

# requests applies no timeout by default; bound the call so a stalled
# connection cannot hang the script indefinitely.
response = requests.post(url, json=payload, headers=headers, timeout=30)
print(response.json())
{
"job_id": "<string>"
}
Bearer authentication header of the form Bearer <token>, where <token> is your auth token.
The URL of the document to be processed. You can provide one of the following:
Show child attributes
The mode to use for OCR. Agentic mode adds an extra pass, correcting any table/text mistakes at a small cost.
Available options: standard, agentic
The mode to use for extraction. Metadata/hybrid are only recommended with high quality metadata embeddings.
Available options: ocr, metadata, hybrid
The configuration options for chunking. Chunking is commonly used for RAG use cases.
Show child attributes
Choose how to partition chunks. Variable mode chunks by character length and visual context. Section mode chunks by section headers. Page mode chunks according to pages. Page sections mode chunks first by page, then by sections within each page. Disabled returns one single chunk.
Available options: variable, section, page, block, disabled, page_sections
The approximate size of chunks (in characters) that the document will be split into. Defaults to None, in which case the chunk size is variable between 250 - 1500 characters.
The configuration options for figure summarization.
Show child attributes
If figure summarization should be performed.
Add information to the prompt for figure summarization. Note any visual cues that should be incorporated. Example: 'When provided a diagram, extract all of the figure content verbatim.'
If the figure summary prompt should override our default prompt.
A list of block types to filter from chunk content. Pass blocks to filter them from content. By default, no blocks are filtered.
Available options: Header, Footer, Title, Section Header, Page Number, List Item, Figure, Table, Key Value, Text, Comment, Signature
Force the result to be returned in URL form (by default only used for very large responses).
Show child attributes
The OCR system to use. Highres is recommended for documents with English characters. Legacy uses an alternative OCR backend.
Available options: highres, multilingual, combined, reducto, legacy
The mode to use for table output. Dynamic returns md for simpler tables and html for more complex tables.
Available options: html, json, md, jsonbbox, dynamic, ai_json, csv
A flag to indicate if consecutive tables with the same number of columns should be merged across breaks and spaces.
If True, preserve formula information in spreadsheet cells by wrapping text with LaTeX formula commands during parsing.
If True, preserve Excel cell colours in the extracted spreadsheet text using LaTeX colour commands.
A flag to indicate if the hierarchy of the document should be continued from chunk to chunk.
If line breaks should be preserved in the text.
The page range to process (1-indexed). By default, the entire document is processed.
Force the URL to be downloaded as a specific file extension (e.g. .png).
The configuration options for large table chunking (currently only supported on spreadsheet and CSV files).
Show child attributes
If large tables should be chunked into smaller tables, currently only supported on spreadsheet and CSV files.
The max row/column size for a table to be chunked. Defaults to 50. Header rows/columns are persisted based on heuristics.
In a spreadsheet with different tables inside, we enable splitting up the tables by default. Intelligent mode applies more powerful models for superior accuracy, at 5× the default per-cell rate. Disabling will register as one large table.
Available options: default, disabled, intelligent
If True, add page markers to the output (e.g. [[PAGE 1 BEGINS HERE]] and [[PAGE 1 ENDS HERE]] added as blocks to the content). Defaults to False.
If True, remove text formatting from the output (e.g. hyphens for list items). Defaults to False.
If True, return OCR data in the result. Defaults to False.
Password to decrypt password-protected documents.
If True, filter out line numbers from the output. Defaults to False.
If True, pull in PDF comments from the document. Defaults to False.
If True, persist the results indefinitely. Defaults to False.
Skip hidden sheets in Excel files. Defaults to False.
Skip hidden rows and cols in Excel files. Defaults to False.
Enables model-based detection of underlines and strikethroughs, adding <u>/<s> tags to OCR text. Works with any extraction mode. Defaults to False.
If True, enable highlight detection. Highlighted text will be surrounded by <mark> tags in the output. Defaults to False.
Show child attributes
The configuration options for enrichment.
Show child attributes
If enabled, a large language/vision model will be used to postprocess the extracted content. Note: enabling enrich requires tables be outputted in markdown format. Defaults to False.
The mode to use for enrichment. Defaults to standard
Available options: standard, page, table
Add information to the prompt for enrichment.
Layout enrichment is a beta feature that improves our layout and reading order performance at the cost of increased latency. Defaults to False.
Instead of using LibreOffice, when enabled, this flag uses a Windows VM to convert files. This is slower but more accurate.
Use an experimental checkbox detection model to add checkboxes to the output, defaults to False
Use an experimental equation detection model to add equations to the output, defaults to False
Use an orientation model to detect and rotate pages as needed, defaults to True
Use an orientation model to detect and rotate figures as needed, defaults to False
Add <sub> tag around subscripts and <sup> tag around superscripts, defaults to False
If figure images should be returned in the result. Defaults to False.
If table images should be returned in the result. Defaults to False.
The layout model to use for the document. This will be deprecated in the future.
Available options: default, beta
If extracted OCR text metadata should be embedded back into the returned PDF, overwriting any existing text. Defaults to False.
If True, detect signatures in the document. Defaults to False.
You probably shouldn't use this. If True, filter out boxes with width greater than 50% of the document width. Defaults to False. You probably don't want to use this.
A user specified timeout, defaults to None
The prompt that describes rules for splitting the document.
If True, attempts to process the job with priority if the user has priority processing budget available; by default, sync jobs are prioritized above async jobs.
Show child attributes
If tables should be truncated to the first few rows or if all content should be preserved. truncate improves latency, preserve is recommended for cases where partition_key is being used and the partition_key may be included within the table. Defaults to truncate
Available options: truncate, preserve
Show child attributes
The mode to use for webhook delivery. Defaults to 'disabled'. We recommend using 'svix' for production environments.
Available options: disabled, svix, direct
The URL to send the webhook to (if using direct webhook).
JSON metadata included in webhook request body
A list of Svix channels the message will be delivered down, omit to send to all channels.
Successful Response
Was this page helpful?