> ## Documentation Index
> Fetch the complete documentation index at: https://docs.reducto.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Extract


## OpenAPI

````yaml openapi-legacy.json post /extract
# Document header: spec version, API identity, and the single base URL.
openapi: '3.1.0'
info:
  version: 'v1.11.81-297-g4204a908d'
  title: 'Reducto API (Legacy)'
# An empty list means no document-wide security requirement; operations
# declare their own `security` where needed.
security: []
servers:
  - url: 'https://platform.reducto.ai'
paths:
  /extract:
    post:
      operationId: extract_extract_post
      summary: Extract
      # Operation-level requirement; overrides the empty top-level `security`.
      security:
        - SkippableHTTPBearer: []
      # A JSON body is mandatory and must validate against ExtractConfigNew.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ExtractConfigNew'
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                title: Response Extract Extract Post
                # The body matches at least one of these two schemas.
                anyOf:
                  - $ref: '#/components/schemas/ExtractResponse'
                  - $ref: '#/components/schemas/V3ExtractResponse'
        # Returned when the request body fails validation.
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
components:
  schemas:
    # Request body accepted by POST /extract.
    # `additionalProperties: false` means keys not listed here fail validation.
    ExtractConfigNew:
      title: ExtractConfigNew
      type: object
      additionalProperties: false
      # Only these two fields are mandatory; every other field is optional.
      required:
        - schema
        - document_url
      properties:
        # Three option groups, each delegated to its own component schema.
        options:
          $ref: '#/components/schemas/BaseProcessingOptions'
        advanced_options:
          $ref: '#/components/schemas/AdvancedProcessingOptions'
        experimental_options:
          $ref: '#/components/schemas/ExperimentalProcessingOptions'
        # No `type` keyword is declared, so any JSON value validates here.
        schema:
          description: The JSON schema to use for extraction.
          title: Schema
        system_prompt:
          title: System Prompt
          type: string
          description: >-
            A system prompt to use for the extraction. This is a general prompt
            that is applied to the entire document before any other prompts.
          default: Be precise and thorough.
        deep_extract:
          title: Deep Extract
          type: boolean
          description: >-
            If True, use Deep Extract, an agentic extraction mode that
            iteratively refines its output to achieve near-perfect accuracy.
            Best for complex documents where accuracy is critical.
          default: false
        generate_citations:
          title: Generate Citations
          type: boolean
          description: If citations should be generated for the extracted content.
          default: false
        # The default object repeats the per-field defaults of ArrayExtractConfig.
        array_extract:
          description: The configuration options for array extract
          $ref: '#/components/schemas/ArrayExtractConfig'
          default:
            enabled: false
            mode: legacy
            pages_per_segment: 10
        use_chunking:
          title: Use Chunking
          type: boolean
          description: If chunking should be used for the extraction. Defaults to False.
          default: false
        include_images:
          title: Include Images
          type: boolean
          description: >-
            If images should be passed directly for extractions. Can only be
            enabled for documents with less than 10 pages. Defaults to False.
          default: false
        spreadsheet_agent:
          title: Spreadsheet Agent
          type: boolean
          description: If spreadsheet agent should be used for extraction.
          default: false
        # Note: this flag and `priority` below are the two booleans on this
        # schema whose declared default is true.
        experimental_table_citations:
          title: Experimental Table Citations
          type: boolean
          description: If table citations should be generated for the extracted content.
          default: true
        priority:
          title: Priority
          type: boolean
          description: >-
            If True, attempts to process the job with priority if the user has
            priority processing budget available; by default, sync jobs are
            prioritized above async jobs.
          default: true
        citations_options:
          description: The configuration options for citations.
          $ref: '#/components/schemas/AdvancedCitationsConfig'
          default:
            numerical_confidence: false
            parent_block: full
        agent_extract:
          description: The configuration options for agent extract
          $ref: '#/components/schemas/AgentExtractConfig'
          default:
            enabled: false
        # Accepts a single string, a list of strings, or an UploadResponse object.
        document_url:
          title: Document Url
          description: >
            The URL of the document to be processed. You can provide one of the
            following:

            1. A publicly available URL

            2. A presigned S3 URL

            3. A reducto:// prefixed URL obtained from the /upload endpoint
            after directly uploading a document

            4. A job_id (jobid://) or a list of job_ids (jobid://) obtained from
            a previous /parse endpoint
          anyOf:
            - type: string
            - type: array
              items:
                type: string
            - $ref: '#/components/schemas/UploadResponse'
    # One of the two 200-response shapes for /extract. No properties are
    # enumerated: any object validates (additionalProperties: true).
    ExtractResponse:
      # Title added for parity with the other component schemas, so tooling can
      # label this anyOf branch instead of showing an anonymous object.
      title: ExtractResponse
      additionalProperties: true
      type: object
    # One of the two 200-response shapes for /extract. No properties are
    # enumerated: any object validates (additionalProperties: true).
    V3ExtractResponse:
      # Title added for parity with the other component schemas, so tooling can
      # label this anyOf branch instead of showing an anonymous object.
      title: V3ExtractResponse
      additionalProperties: true
      type: object
    # Body of the 422 response: `detail` is a list of ValidationError entries.
    HTTPValidationError:
      title: HTTPValidationError
      type: object
      properties:
        detail:
          title: Detail
          type: array
          items:
            $ref: '#/components/schemas/ValidationError'
    # Option group referenced by ExtractConfigNew.options. No field is
    # required; each one with a `default` falls back to that value.
    BaseProcessingOptions:
      title: BaseProcessingOptions
      type: object
      properties:
        ocr_mode:
          title: Ocr Mode
          type: string
          enum:
            - standard
            - agentic
          description: >-
            The mode to use for OCR. Agentic mode adds an extra pass, correcting
            any table/text mistakes at a small cost.
          default: standard
        extraction_mode:
          title: Extraction Mode
          type: string
          enum:
            - ocr
            - metadata
            - hybrid
          description: >-
            The mode to use for extraction. Metadata/hybrid are only recommended
            with high quality metadata embeddings.
          default: ocr
        chunking:
          description: >-
            The configuration options for chunking. Chunking is commonly used
            for RAG usecases.
          $ref: '#/components/schemas/ChunkingConfig'
          default:
            chunk_mode: variable
            chunk_overlap: 0
        table_summary:
          description: The configuration options for table summarization.
          $ref: '#/components/schemas/TableSummaryConfig'
          default:
            enabled: false
        # Unlike table_summary, no default object is declared for this field.
        figure_summary:
          description: The configuration options for figure summarization.
          $ref: '#/components/schemas/FigureSummaryConfig'
        # Each entry must be one of the block-type names enumerated below.
        filter_blocks:
          title: Filter Blocks
          type: array
          items:
            type: string
            enum:
              - Header
              - Footer
              - Title
              - Section Header
              - Page Number
              - List Item
              - Figure
              - Table
              - Key Value
              - Text
              - Comment
              - Signature
          description: >-
            A list of block types to filter from chunk content. Pass blocks to
            filter them from content. By default, no blocks are filtered.
          default: []
        force_url_result:
          title: Force Url Result
          type: boolean
          description: >-
            Force the result to be returned in URL form (by default only used
            for very large responses).
          default: false
    # Option group referenced by ExtractConfigNew.advanced_options. No field is
    # required; each one with a `default` falls back to that value.
    AdvancedProcessingOptions:
      properties:
        ocr_system:
          type: string
          enum:
            - highres
            - multilingual
            - combined
            - reducto
            - legacy
          title: Ocr System
          description: >-
            The OCR system to use. Highres is recommended for documents with
            English characters. Legacy uses an alternative OCR backend.
          default: highres
        table_output_format:
          type: string
          enum:
            - html
            - json
            - md
            - jsonbbox
            - dynamic
            - ai_json
            - csv
          title: Table Output Format
          description: >-
            The mode to use for table output. Dynamic returns md for simpler
            tables and html for more complex tables.
          default: html
        merge_tables:
          type: boolean
          title: Merge Tables
          description: >-
            A flag to indicate if consecutive tables with the same number of
            columns should be merged across breaks and spaces.
          default: false
        # Spreadsheet rendering flags: formula, colour and dropdown details.
        include_formula_information:
          type: boolean
          title: Include Formula Information
          description: >-
            If True, preserve formula information in spreadsheet cells by
            wrapping text with LaTeX formula commands during parsing.
          default: false
        include_color_information:
          type: boolean
          title: Include Color Information
          description: >-
            If True, preserve Excel cell colours in the extracted spreadsheet
            text using LaTeX colour commands.
          default: false
        include_dropdown_information:
          type: boolean
          title: Include Dropdown Information
          description: >-
            If True, include dropdown options and the selected value when
            rendering spreadsheet cells.
          default: false
        continue_hierarchy:
          type: boolean
          title: Continue Hierarchy
          description: >-
            A flag to indicate if the hierarchy of the document should be
            continued from chunk to chunk.
          default: true
        keep_line_breaks:
          type: boolean
          title: Keep Line Breaks
          description: If line breaks should be preserved in the text.
          default: false
        # Four accepted shapes: one PageRange, a list of PageRange, a list of
        # integers, or a list of strings. The `{}` default is an empty PageRange.
        page_range:
          anyOf:
            - $ref: '#/components/schemas/PageRange'
            - items:
                $ref: '#/components/schemas/PageRange'
              type: array
            - items:
                type: integer
              type: array
            - items:
                type: string
              type: array
          title: Page Range
          description: >-
            The page range to process (1-indexed). By default, the entire
            document is processed. For spreadsheets, you can also provide a list
            of sheet names.
          default: {}
        force_file_extension:
          type: string
          title: Force File Extension
          description: >-
            Force the URL to be downloaded as a specific file extension (e.g.
            .png).
        large_table_chunking:
          $ref: '#/components/schemas/LargeTableChunkingConfig'
          description: >-
            The configuration options for large table chunking (currently only
            supported on spreadsheet and CSV files).
          default:
            enabled: true
            size: 50
        spreadsheet_table_clustering:
          type: string
          enum:
            - default
            - disabled
            - intelligent
          title: Spreadsheet Table Clustering
          description: >-
            In a spreadsheet with different tables inside, we enable splitting
            up the tables by default. Intelligent mode applies more powerful
            models for superior accuracy, at 5× the default per-cell rate.
            Disabling will register as one large table.
          default: default
        max_cell_count:
          anyOf:
            - type: integer
              minimum: 1
            - type: 'null'
          title: Max Cell Count
          description: >-
            Maximum total non-empty cells allowed across all sheets. If
            exceeded, the request is rejected with a 422 error. Set to null to
            disable the limit. Defaults to null.
          # Declared explicitly so the machine-readable schema agrees with the
          # "Defaults to null" statement in the description above.
          default: null
        add_page_markers:
          type: boolean
          title: Add Page Markers
          description: >-
            If True, add page markers to the output (e.g. [[PAGE 1 BEGINS HERE]]
            and [[PAGE 1 ENDS HERE]] added as blocks to the content). Defaults
            to False.
          default: false
        remove_text_formatting:
          type: boolean
          title: Remove Text Formatting
          description: >-
            If True, remove text formatting from the output (e.g. hyphens for
            list items). Defaults to False.
          default: false
        return_ocr_data:
          type: boolean
          title: Return Ocr Data
          description: If True, return OCR data in the result. Defaults to False.
          default: false
        document_password:
          type: string
          title: Document Password
          description: Password to decrypt password-protected documents.
        filter_line_numbers:
          type: boolean
          title: Filter Line Numbers
          description: If True, filter out line numbers from the output. Defaults to False.
          default: false
        read_comments:
          type: boolean
          title: Read Comments
          description: If True, pull in PDF comments from the document. Defaults to False.
          default: false
        persist_results:
          type: boolean
          title: Persist Results
          description: If True, persist the results indefinitely. Defaults to False.
          default: false
        # Excel-only switches for skipping hidden content.
        exclude_hidden_sheets:
          type: boolean
          title: Exclude Hidden Sheets
          description: Skip hidden sheets in Excel files. Defaults to False.
          default: false
        exclude_hidden_rows_cols:
          type: boolean
          title: Exclude Hidden Rows Cols
          description: Skip hidden rows and cols in Excel files. Defaults to False.
          default: false
        # The next two flags add inline HTML-style tags (<u>/<s>, <mark>) to
        # the output text, per their descriptions.
        enable_change_tracking:
          type: boolean
          title: Enable Change Tracking
          description: >-
            Enables model-based detection of underlines and strikethroughs,
            adding <u>/<s> tags to OCR text. Works with any extraction mode.
            Defaults to False.
          default: false
        enable_highlight_detection:
          type: boolean
          title: Enable Highlight Detection
          description: >-
            If True, enable highlight detection. Highlighted text will be
            surrounded by <mark> tags in the output. Defaults to False.
          default: false
        ignore_watermarks:
          type: boolean
          title: Ignore Watermarks
          description: >-
            If True, ignore and remove watermarks from OCR output. Defaults to
            False.
          default: false
      type: object
      title: AdvancedProcessingOptions
    # Option group referenced by ExtractConfigNew.experimental_options.
    # `additionalProperties: true` (at the bottom) means keys not listed here
    # are still accepted by validation.
    ExperimentalProcessingOptions:
      properties:
        enrich:
          $ref: '#/components/schemas/EnrichConfig'
          description: The configuration options for enrichment.
          default:
            enabled: false
            mode: standard
        layout_enrichment:
          type: boolean
          title: Layout Enrichment
          description: >-
            Layout enrichment is a beta feature that improves our layout and
            reading order performance at the cost of increased latency. Defaults
            to False.
          default: false
        enable_checkboxes:
          type: boolean
          title: Enable Checkboxes
          description: >-
            Use an experimental checkbox detection model to add checkboxes to
            the output, defaults to False
          default: false
        enable_equations:
          type: boolean
          title: Enable Equations
          description: >-
            Use an experimental equation detection model to add equations to the
            output, defaults to False
          default: false
        # Note the asymmetry: page rotation defaults to true, figure rotation
        # (next field) defaults to false.
        rotate_pages:
          type: boolean
          title: Rotate Pages
          description: >-
            Use an orientation model to detect and rotate pages as needed,
            defaults to True
          default: true
        rotate_figures:
          type: boolean
          title: Rotate Figures
          description: >-
            Use an orientation model to detect and rotate figures as needed,
            defaults to False
          default: false
        enable_scripts:
          type: boolean
          title: Enable Scripts
          description: >-
            Add <sub> tag around subscripts and <sup> tag around superscripts,
            defaults to False
          default: false
        # Three opt-in flags for returning rendered images in the result.
        return_figure_images:
          type: boolean
          title: Return Figure Images
          description: >-
            If figure images should be returned in the result. Defaults to
            False.
          default: false
        return_table_images:
          type: boolean
          title: Return Table Images
          description: If table images should be returned in the result. Defaults to False.
          default: false
        return_page_images:
          type: boolean
          title: Return Page Images
          description: >-
            If full page images should be returned in the result. Defaults to
            False.
          default: false
        layout_model:
          type: string
          enum:
            - default
            - beta
          title: Layout Model
          description: >-
            The layout model to use for the document. This will be deprecated in
            the future.
          default: default
        embed_text_metadata_pdf:
          type: boolean
          title: Embed Text Metadata Pdf
          description: >-
            If extracted OCR text metadata should be embedded back into the
            returned PDF, overwriting any existing text. Defaults to False.
          default: false
        # `minimum`/`maximum` enforce the "Min 50, max 250" range stated in the
        # description.
        embed_pdf_metadata_dpi:
          type: integer
          maximum: 250
          minimum: 50
          title: Embed Pdf Metadata Dpi
          description: >-
            Render DPI used when rasterizing the source PDF before embedding the
            OCR text layer. Lower values produce dramatically smaller output
            PDFs; higher values preserve more detail when zoomed past 200%.
            Defaults to 100 (good for on-screen viewing); raise toward the
            source scan DPI for crisper output. Min 50, max 250.
          default: 100
        detect_signatures:
          type: boolean
          title: Detect Signatures
          description: If True, detect signatures in the document. Defaults to False.
          default: false
        danger_filter_wide_boxes:
          type: boolean
          title: Danger Filter Wide Boxes
          description: >-
            You probably shouldn't use this. If True, filter out boxes with
            width greater than 50% of the document width. Defaults to False. You
            probably don't want to use this.
          default: false
        user_specified_timeout_seconds:
          anyOf:
            - type: number
            - type: 'null'
          title: User Specified Timeout Seconds
          description: A user specified timeout, defaults to None
          # Declared explicitly so the machine-readable schema agrees with the
          # "defaults to None" statement in the description above.
          default: null
      additionalProperties: true
      type: object
      title: ExperimentalProcessingOptions
    # Settings object referenced by ExtractConfigNew.array_extract.
    ArrayExtractConfig:
      title: ArrayExtractConfig
      type: object
      properties:
        enabled:
          title: Enabled
          type: boolean
          description: >-
            Array extraction allows you to extract long lists of information
            from lengthy documents. It makes parallel calls on overlapping
            sections of the document.
          default: false
        mode:
          title: Mode
          type: string
          enum:
            - auto
            - legacy
            - no_overlap
          description: The array extraction version to use.
          default: legacy
        # No minimum/maximum is declared for this integer.
        pages_per_segment:
          title: Pages Per Segment
          type: integer
          description: >-
            Length of each segment, in pages, for parallel calls with array
            extraction.
          default: 10
    # Settings object referenced by ExtractConfigNew.citations_options.
    AdvancedCitationsConfig:
      title: AdvancedCitationsConfig
      type: object
      properties:
        numerical_confidence:
          title: Numerical Confidence
          type: boolean
          description: >-
            If True, enable numeric citation confidence scores. Defaults to
            False.
          default: false
        # Allowed values come from the CitationParentBlockMode enum.
        parent_block:
          description: >-
            How much of the source parse block to embed on each citation's
            parentBlock. 'full' (default) embeds the verbatim source-block HTML
            in parentBlock.content. 'bbox_only' suppresses parentBlock.content
            (returned as an empty string) while keeping parentBlock.bbox and all
            citation-level fields — this can drastically shrink responses on
            table-heavy schemas where the same source block is cited many times.
          $ref: '#/components/schemas/CitationParentBlockMode'
          default: full
    # Settings object referenced by ExtractConfigNew.agent_extract; it holds a
    # single on/off switch.
    AgentExtractConfig:
      title: AgentExtractConfig
      type: object
      properties:
        enabled:
          title: Enabled
          type: boolean
          description: If agent extraction should be used for extraction.
          default: false
    # Object form accepted by ExtractConfigNew.document_url. Only `file_id` is
    # mandatory.
    UploadResponse:
      title: UploadResponse
      type: object
      required:
        - file_id
      properties:
        file_id:
          title: File Id
          type: string
        # Nullable: either a string or an explicit null.
        presigned_url:
          title: Presigned Url
          anyOf:
            - type: string
            - type: 'null'
    # One entry of HTTPValidationError.detail. `loc`, `msg` and `type` are
    # always present; `input` and `ctx` are optional.
    ValidationError:
      title: ValidationError
      type: object
      required:
        - loc
        - msg
        - type
      properties:
        # Each element is either a string or an integer.
        loc:
          title: Location
          type: array
          items:
            anyOf:
              - type: string
              - type: integer
        msg:
          title: Message
          type: string
        # This is a property literally named `type`, not the schema keyword.
        type:
          title: Error Type
          type: string
        # No `type` keyword is declared, so any JSON value validates here.
        input:
          title: Input
        ctx:
          title: Context
          type: object
    # Settings object referenced by BaseProcessingOptions.chunking.
    ChunkingConfig:
      title: ChunkingConfig
      type: object
      properties:
        chunk_mode:
          title: Chunk Mode
          type: string
          enum:
            - variable
            - section
            - page
            - block
            - disabled
            - page_sections
          description: >-
            Choose how to partition chunks. Variable mode chunks by character
            length and visual context. Section mode chunks by section headers.
            Page mode chunks according to pages. Page sections mode chunks first
            by page, then by sections within each page. Disabled returns one
            single chunk.
          default: variable
        # No `default` key is declared; the description explains the fallback.
        chunk_size:
          title: Chunk Size
          type: integer
          description: >-
            The approximate size of chunks (in characters) that the document
            will be split into. Defaults to None, in which case the chunk size
            is variable between 250 - 1500 characters.
        chunk_overlap:
          title: Chunk Overlap
          type: integer
          description: >-
            Number of characters of overlap to include from adjacent chunks.
            Defaults to 0.
          default: 0
    # Settings object referenced by BaseProcessingOptions.table_summary.
    TableSummaryConfig:
      title: TableSummaryConfig
      type: object
      properties:
        enabled:
          title: Enabled
          type: boolean
          description: If table summarization should be performed.
          default: false
        # Optional free-text addition; no default is declared.
        prompt:
          title: Prompt
          type: string
          description: Add information to the prompt for table summarization.
    # Settings object referenced by BaseProcessingOptions.figure_summary.
    FigureSummaryConfig:
      title: FigureSummaryConfig
      type: object
      properties:
        enabled:
          title: Enabled
          type: boolean
          description: If figure summarization should be performed.
          default: false
        # Optional free-text addition; no default is declared.
        prompt:
          title: Prompt
          type: string
          description: >-
            Add information to the prompt for figure summarization. Note any
            visual cues that should be incorporated. Example: 'When provided a
            diagram, extract all of the figure content verbatim.'
        override:
          title: Override
          type: boolean
          description: If the figure summary prompt should override our default prompt.
          default: false
        advanced_chart_agent:
          title: Advanced Chart Agent
          type: boolean
          description: If True, use the advanced chart agent. Defaults to False.
          default: false
    # Page bounds used by AdvancedProcessingOptions.page_range. Both bounds are
    # optional and nullable, so an empty object is a valid PageRange.
    PageRange:
      title: PageRange
      type: object
      properties:
        start:
          title: Start
          description: The page number to start processing from (1-indexed).
          anyOf:
            - type: integer
            - type: 'null'
        end:
          title: End
          description: The page number to stop processing at (1-indexed).
          anyOf:
            - type: integer
            - type: 'null'
    # Settings object referenced by
    # AdvancedProcessingOptions.large_table_chunking.
    LargeTableChunkingConfig:
      title: LargeTableChunkingConfig
      type: object
      properties:
        # Enabled by default, unlike most boolean switches in this spec.
        enabled:
          title: Enabled
          type: boolean
          description: >-
            If large tables should be chunked into smaller tables, currently
            only supported on spreadsheet and CSV files.
          default: true
        size:
          title: Size
          type: integer
          description: >-
            The max row/column size for a table to be chunked. Defaults to 50.
            Header rows/columns are persisted based on heuristics.
          default: 50
    # Settings object referenced by ExperimentalProcessingOptions.enrich.
    EnrichConfig:
      title: EnrichConfig
      type: object
      properties:
        enabled:
          title: Enabled
          type: boolean
          description: >-
            If enabled, a large language/vision model will be used to
            postprocess the extracted content. Note: enabling enrich requires
            tables be outputted in markdown format. Defaults to False.
          default: false
        mode:
          title: Mode
          type: string
          enum:
            - standard
            - page
            - table
            - table_auto
          description: The mode to use for enrichment. Defaults to standard
          default: standard
        # Optional free-text addition; no default is declared.
        prompt:
          title: Prompt
          type: string
          description: Add information to the prompt for enrichment.
    # String enum used by AdvancedCitationsConfig.parent_block.
    CitationParentBlockMode:
      type: string
      enum:
        - full
        - bbox_only
      title: CitationParentBlockMode
      # In a folded scalar every blank line becomes a newline, so sentences are
      # kept on adjacent lines and blank lines are used only between paragraphs.
      # Mode names are written exactly as the enum values above.
      description: >-
        How much of the source parse block to embed on each citation's
        ``parentBlock``.


        ``full`` embeds the verbatim source-block HTML in
        ``parentBlock.content``.


        ``bbox_only`` returns ``parentBlock.content`` as an empty string while
        keeping ``parentBlock.bbox`` and all citation-level fields — on
        table-heavy schemas the same source block is re-embedded into every
        citation that points into it, so the content can dominate the response
        (90%+ of bytes) while being redundant with the parse output.
  # HTTP bearer-token scheme; the /extract operation references it by name in
  # its `security` list.
  securitySchemes:
    SkippableHTTPBearer:
      scheme: bearer
      type: http

````