Parse - Reducto API

Parse

import requests

# Parse endpoint that the request is POSTed to.
url = "https://platform.reducto.ai/parse"

# Bearer-token authorization; replace <token> with your auth token.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json",
}

# Request body, assembled one top-level section at a time.
# "input" identifies the document to process (e.g. a URL); replace the
# "<string>" placeholder before sending.
payload = {"input": "<string>"}

payload["enhance"] = {
    "agentic": [],
    "summarize_figures": True,
}

payload["retrieval"] = {
    "chunking": {
        "chunk_mode": "disabled",
    },
    "filter_blocks": [],
    "embedding_optimized": False,
}

payload["formatting"] = {
    "add_page_markers": False,
    "table_output_format": "dynamic",
    "merge_tables": False,
    "include": [],
}

payload["spreadsheet"] = {
    "split_large_tables": {
        "enabled": True,
        "size": 50,
    },
    "include": [],
    "clustering": "accurate",
    "exclude": [],
}

payload["settings"] = {
    "ocr_system": "standard",
    "extraction_mode": "hybrid",
    "force_url_result": False,
    "return_ocr_data": False,
    "return_images": [],
    "embed_pdf_metadata": False,
    "persist_results": False,
}

# Send the payload as a JSON body and keep the response object.
response = requests.post(url, headers=headers, json=payload)

# Print the raw response body.
print(response.text)

{
  "job_id": "<string>",
  "duration": 123,
  "usage": {
    "num_pages": 123,
    "credits": 123
  },
  "result": {
    "type": "<string>",
    "chunks": [
      {
        "content": "<string>",
        "embed": "<string>",
        "enriched": "<string>",
        "blocks": [
          {
            "type": "Header",
            "bbox": {
              "left": 123,
              "top": 123,
              "width": 123,
              "height": 123,
              "page": 123,
              "original_page": 123
            },
            "content": "<string>",
            "image_url": "<string>",
            "chart_data": [
              "<string>"
            ],
            "confidence": "low",
            "granular_confidence": {
              "extract_confidence": 123,
              "parse_confidence": 123
            },
            "extra": {}
          }
        ],
        "enrichment_success": false
      }
    ],
    "ocr": {
      "words": [
        {
          "text": "<string>",
          "bbox": {
            "left": 123,
            "top": 123,
            "width": 123,
            "height": 123,
            "page": 123,
            "original_page": 123
          },
          "confidence": 123,
          "chunk_index": 123,
          "rotation": 123
        }
      ],
      "lines": [
        {
          "text": "<string>",
          "bbox": {
            "left": 123,
            "top": 123,
            "width": 123,
            "height": 123,
            "page": 123,
            "original_page": 123
          },
          "confidence": 123,
          "chunk_index": 123,
          "rotation": 123
        }
      ]
    },
    "custom": "<unknown>"
  },
  "pdf_url": "<string>",
  "studio_link": "<string>"
}

POST

parse

Parse

import requests

# Parse endpoint that the request is POSTed to.
url = "https://platform.reducto.ai/parse"

# Bearer-token authorization; replace <token> with your auth token.
headers = {
    "Authorization": "Bearer <token>",
    "Content-Type": "application/json",
}

# Request body, assembled one top-level section at a time.
# "input" identifies the document to process (e.g. a URL); replace the
# "<string>" placeholder before sending.
payload = {"input": "<string>"}

payload["enhance"] = {
    "agentic": [],
    "summarize_figures": True,
}

payload["retrieval"] = {
    "chunking": {
        "chunk_mode": "disabled",
    },
    "filter_blocks": [],
    "embedding_optimized": False,
}

payload["formatting"] = {
    "add_page_markers": False,
    "table_output_format": "dynamic",
    "merge_tables": False,
    "include": [],
}

payload["spreadsheet"] = {
    "split_large_tables": {
        "enabled": True,
        "size": 50,
    },
    "include": [],
    "clustering": "accurate",
    "exclude": [],
}

payload["settings"] = {
    "ocr_system": "standard",
    "extraction_mode": "hybrid",
    "force_url_result": False,
    "return_ocr_data": False,
    "return_images": [],
    "embed_pdf_metadata": False,
    "persist_results": False,
}

# Send the payload as a JSON body and keep the response object.
response = requests.post(url, headers=headers, json=payload)

# Print the raw response body.
print(response.text)

{
  "job_id": "<string>",
  "duration": 123,
  "usage": {
    "num_pages": 123,
    "credits": 123
  },
  "result": {
    "type": "<string>",
    "chunks": [
      {
        "content": "<string>",
        "embed": "<string>",
        "enriched": "<string>",
        "blocks": [
          {
            "type": "Header",
            "bbox": {
              "left": 123,
              "top": 123,
              "width": 123,
              "height": 123,
              "page": 123,
              "original_page": 123
            },
            "content": "<string>",
            "image_url": "<string>",
            "chart_data": [
              "<string>"
            ],
            "confidence": "low",
            "granular_confidence": {
              "extract_confidence": 123,
              "parse_confidence": 123
            },
            "extra": {}
          }
        ],
        "enrichment_success": false
      }
    ],
    "ocr": {
      "words": [
        {
          "text": "<string>",
          "bbox": {
            "left": 123,
            "top": 123,
            "width": 123,
            "height": 123,
            "page": 123,
            "original_page": 123
          },
          "confidence": 123,
          "chunk_index": 123,
          "rotation": 123
        }
      ],
      "lines": [
        {
          "text": "<string>",
          "bbox": {
            "left": 123,
            "top": 123,
            "width": 123,
            "height": 123,
            "page": 123,
            "original_page": 123
          },
          "confidence": 123,
          "chunk_index": 123,
          "rotation": 123
        }
      ]
    },
    "custom": "<unknown>"
  },
  "pdf_url": "<string>",
  "studio_link": "<string>"
}

Authorizations

Authorization

string

header

required

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.

Body

application/json

SyncParseConfig
AsyncParseConfig

input

required

For parse/split/extract pipelines, the URL of the document to be processed. You can provide one of the following: (1) a publicly available URL; (2) a presigned S3 URL; (3) a reducto:// prefixed URL obtained from the /upload endpoint after directly uploading a document; or (4) a jobid:// prefixed URL obtained from a previous /parse invocation.

For edit pipelines, this should be a string containing the edit instructions.

enhance

Enhance · object

Show child attributes

retrieval

Retrieval · object

Show child attributes

formatting

Formatting · object

Show child attributes

spreadsheet

Spreadsheet · object

Show child attributes

settings

Settings · object

Show child attributes

Response

Successful Response

ParseResponse
AsyncParseResponse

job_id

string

required

duration

number

required

The duration of the parse request in seconds.

usage

ParseUsage · object

required

Show child attributes

result

FullResult · object

required

The response from the document processing service. Note that there are two possible response types, Full Result and URL Result. This is due to limits on the maximum size of an HTTPS response. If the response is too large, it will be returned as a presigned URL in a URL Result instead. You should handle both cases in your application.

FullResult
UrlResult

Show child attributes

pdf_url

string | null

The storage URL of the converted PDF file.

studio_link

string | null

The link to the studio pipeline for the document.

Pipeline Async Extract

⌘I