Parse

Parse

import requests

# Example: submit a document to the Reducto /parse endpoint and print the raw
# response body.
url = "https://platform.reducto.ai/parse"

# JSON request body. Only "document_url" is required; the option groups and
# "priority" below are optional.
payload = {
    # Replace with a publicly available URL, a presigned S3 URL, or a
    # reducto:// URL obtained from the /upload endpoint.
    "document_url": "<string>",
    "options": {
        "ocr_mode": "standard",
        "extraction_mode": "ocr",
        "chunking": { "chunk_mode": "variable" },
        "table_summary": { "enabled": False },
        "figure_summary": {
            "enabled": False,
            "enhanced": False,
            "override": False
        },
        "filter_blocks": [],
        "force_url_result": False
    },
    "advanced_options": {
        "ocr_system": "highres",
        "table_output_format": "html",
        "merge_tables": False,
        "include_formula_information": False,
        "include_color_information": False,
        "continue_hierarchy": True,
        "keep_line_breaks": False,
        "page_range": {},
        "large_table_chunking": {
            "enabled": True,
            "size": 50
        },
        "spreadsheet_table_clustering": "default",
        "add_page_markers": False,
        "remove_text_formatting": False,
        "return_ocr_data": False,
        "filter_line_numbers": False,
        "read_comments": False,
        "persist_results": False,
        "exclude_hidden_sheets": False,
        "exclude_hidden_rows_cols": False,
        "enable_change_tracking": False,
        "enable_highlight_detection": False
    },
    "experimental_options": {
        "enrich": {
            "enabled": False,
            "mode": "standard"
        },
        "layout_enrichment": False,
        "native_office_conversion": False,
        "enable_checkboxes": False,
        "enable_equations": False,
        "rotate_pages": True,
        "rotate_figures": False,
        "enable_scripts": False,
        "return_figure_images": False,
        "return_table_images": False,
        "layout_model": "default",
        "embed_text_metadata_pdf": False,
        "detect_signatures": False,
        "danger_filter_wide_boxes": False
    },
    # Request priority processing when priority budget is available.
    "priority": True
}

# Bearer authentication: replace <token> with your auth token.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests applies no timeout by default, so a stalled connection would block
# this script forever. The tuple is (connect, read) in seconds; the read value
# is a generous placeholder -- tune it to how long your documents take to parse.
response = requests.post(url, json=payload, headers=headers, timeout=(10, 300))

# The body is JSON; the result may be inline or a presigned URL when it is too
# large to return directly, so callers should handle both.
print(response.text)

{
  "job_id": "<string>",
  "duration": 123,
  "usage": {
    "num_pages": 123,
    "credits": 123
  },
  "result": {
    "type": "<string>",
    "chunks": [
      {
        "content": "<string>",
        "embed": "<string>",
        "enriched": "<string>",
        "blocks": [
          {
            "type": "Header",
            "bbox": {
              "left": 123,
              "top": 123,
              "width": 123,
              "height": 123,
              "page": 123,
              "original_page": 123
            },
            "content": "<string>",
            "image_url": "<string>",
            "confidence": "low",
            "granular_confidence": {
              "extract_confidence": 123,
              "parse_confidence": 123
            }
          }
        ],
        "enrichment_success": false
      }
    ],
    "ocr": {
      "words": [
        {
          "text": "<string>",
          "bbox": {
            "left": 123,
            "top": 123,
            "width": 123,
            "height": 123,
            "page": 123,
            "original_page": 123
          },
          "confidence": 123,
          "chunk_index": 123
        }
      ],
      "lines": [
        {
          "text": "<string>",
          "bbox": {
            "left": 123,
            "top": 123,
            "width": 123,
            "height": 123,
            "page": 123,
            "original_page": 123
          },
          "confidence": 123,
          "chunk_index": 123
        }
      ]
    },
    "custom": null
  },
  "pdf_url": "<string>",
  "studio_link": "<string>"
}

POST

parse

Parse

import requests

# Example: submit a document to the Reducto /parse endpoint and print the raw
# response body.
url = "https://platform.reducto.ai/parse"

# JSON request body. Only "document_url" is required; the option groups and
# "priority" below are optional.
payload = {
    # Replace with a publicly available URL, a presigned S3 URL, or a
    # reducto:// URL obtained from the /upload endpoint.
    "document_url": "<string>",
    "options": {
        "ocr_mode": "standard",
        "extraction_mode": "ocr",
        "chunking": { "chunk_mode": "variable" },
        "table_summary": { "enabled": False },
        "figure_summary": {
            "enabled": False,
            "enhanced": False,
            "override": False
        },
        "filter_blocks": [],
        "force_url_result": False
    },
    "advanced_options": {
        "ocr_system": "highres",
        "table_output_format": "html",
        "merge_tables": False,
        "include_formula_information": False,
        "include_color_information": False,
        "continue_hierarchy": True,
        "keep_line_breaks": False,
        "page_range": {},
        "large_table_chunking": {
            "enabled": True,
            "size": 50
        },
        "spreadsheet_table_clustering": "default",
        "add_page_markers": False,
        "remove_text_formatting": False,
        "return_ocr_data": False,
        "filter_line_numbers": False,
        "read_comments": False,
        "persist_results": False,
        "exclude_hidden_sheets": False,
        "exclude_hidden_rows_cols": False,
        "enable_change_tracking": False,
        "enable_highlight_detection": False
    },
    "experimental_options": {
        "enrich": {
            "enabled": False,
            "mode": "standard"
        },
        "layout_enrichment": False,
        "native_office_conversion": False,
        "enable_checkboxes": False,
        "enable_equations": False,
        "rotate_pages": True,
        "rotate_figures": False,
        "enable_scripts": False,
        "return_figure_images": False,
        "return_table_images": False,
        "layout_model": "default",
        "embed_text_metadata_pdf": False,
        "detect_signatures": False,
        "danger_filter_wide_boxes": False
    },
    # Request priority processing when priority budget is available.
    "priority": True
}

# Bearer authentication: replace <token> with your auth token.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json"
}

# requests applies no timeout by default, so a stalled connection would block
# this script forever. The tuple is (connect, read) in seconds; the read value
# is a generous placeholder -- tune it to how long your documents take to parse.
response = requests.post(url, json=payload, headers=headers, timeout=(10, 300))

# The body is JSON; the result may be inline or a presigned URL when it is too
# large to return directly, so callers should handle both.
print(response.text)

{
  "job_id": "<string>",
  "duration": 123,
  "usage": {
    "num_pages": 123,
    "credits": 123
  },
  "result": {
    "type": "<string>",
    "chunks": [
      {
        "content": "<string>",
        "embed": "<string>",
        "enriched": "<string>",
        "blocks": [
          {
            "type": "Header",
            "bbox": {
              "left": 123,
              "top": 123,
              "width": 123,
              "height": 123,
              "page": 123,
              "original_page": 123
            },
            "content": "<string>",
            "image_url": "<string>",
            "confidence": "low",
            "granular_confidence": {
              "extract_confidence": 123,
              "parse_confidence": 123
            }
          }
        ],
        "enrichment_success": false
      }
    ],
    "ocr": {
      "words": [
        {
          "text": "<string>",
          "bbox": {
            "left": 123,
            "top": 123,
            "width": 123,
            "height": 123,
            "page": 123,
            "original_page": 123
          },
          "confidence": 123,
          "chunk_index": 123
        }
      ],
      "lines": [
        {
          "text": "<string>",
          "bbox": {
            "left": 123,
            "top": 123,
            "width": 123,
            "height": 123,
            "page": 123,
            "original_page": 123
          },
          "confidence": 123,
          "chunk_index": 123
        }
      ]
    },
    "custom": null
  },
  "pdf_url": "<string>",
  "studio_link": "<string>"
}

Authorizations

Authorization

string

header

required

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.

Body

application/json

document_url

required

The URL of the document to be processed. You can provide one of the following:

A publicly available URL
A presigned S3 URL
A reducto:// prefixed URL obtained from the /upload endpoint after directly uploading a document

options

BaseProcessingOptions · object

Show child attributes

advanced_options

AdvancedProcessingOptions · object

Show child attributes

experimental_options

ExperimentalProcessingOptions · object

Show child attributes

priority

boolean

default:true

If true, attempts to process the job with priority when the user has priority processing budget available. By default, sync jobs are prioritized above async jobs.

Response

Successful Response

ParseResponse
AsyncParseResponse

job_id

string

required

duration

number

required

The duration of the parse request in seconds.

usage

ParseUsage · object

required

Show child attributes

result

FullResult · object

required

The response from the document processing service. There are two possible result types, Full Result and URL Result, because of limits on the maximum size of an HTTPS response. If the result is too large to return inline, it is returned as a presigned URL in a URL Result instead. Your application should handle both cases.

FullResult
UrlResult

Show child attributes

pdf_url

string | null

The storage URL of the converted PDF file.

studio_link

string | null

The link to the studio pipeline for the document.

Parse Async

⌘I

Document Processing

Job Management

Utilities

Authorizations

Body

Response