> ## Documentation Index
> Fetch the complete documentation index at: https://documentation.datalab.to/llms.txt
> Use this file to discover all available pages before exploring further.

# Extract Structured Data

> Extract structured data from a document using a JSON schema. Provide a file for end-to-end processing, or a checkpoint_id from a previous /convert call to skip re-parsing.



## OpenAPI

````yaml https://www.datalab.to/openapi.json post /api/v1/extract
# OpenAPI 3.1 description of the Datalab API; this document contains the
# single POST /api/v1/extract operation.
openapi: 3.1.0
info:
  title: Datalab API
  version: 0.0.1
servers:
  - url: https://www.datalab.to
    description: Datalab API
# No document-wide security requirement is set here; the extract operation
# declares its own `security` entry.
security: []
paths:
  # Structured-extraction endpoint. Accepts a multipart form and answers with
  # an InitialResponse that carries a request_id and a request_check_url.
  /api/v1/extract:
    post:
      summary: Extract Structured Data
      description: >-
        Extract structured data from a document using a JSON schema. Provide a
        file for end-to-end processing, or a checkpoint_id from a previous
        /convert call to skip re-parsing.
      operationId: extract_api_v1_extract_post
      # Both parameters are optional string cookies.
      parameters:
        - name: wos-session
          in: cookie
          required: false
          schema:
            type: string
            title: Wos-Session
        - name: datalab_active_team
          in: cookie
          required: false
          schema:
            type: string
            title: Datalab Active Team
      # Form fields are defined by Body_extract_api_v1_extract_post.
      requestBody:
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/Body_extract_api_v1_extract_post'
      responses:
        # Status codes are quoted so they stay string keys.
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/InitialResponse'
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
      # Uses the APIKeyHeader scheme (X-API-Key request header).
      security:
        - APIKeyHeader: []
components:
  schemas:
    Body_extract_api_v1_extract_post:
      properties:
        # Input source given as a URL; nullable string.
        file_url:
          anyOf:
            - type: string
            - type: 'null'
          title: File Url
          description: Optional file URL. Provide either file/file_url or checkpoint_id.
        # Alternative input source: a checkpoint saved by an earlier /convert
        # request; nullable string.
        checkpoint_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Checkpoint Id
          description: >-
            Checkpoint ID from a previous /convert request (with
            save_checkpoint=true). Skips re-parsing when provided.
        # Inline JSON Schema describing the fields to extract, sent as a
        # nullable string form field. Mutually exclusive with schema_id.
        page_schema:
          anyOf:
            - type: string
            - type: 'null'
          title: Page Schema
          # model_json_schema() produces a model's JSON Schema, whereas
          # model_dump_json() serializes an instance's data, so the former is
          # the method to recommend here.
          description: >-
            The JSON schema for structured extraction. Generate with Pydantic
            YourModel.model_json_schema() or write manually. Must contain a
            'properties' key. Mutually exclusive with schema_id.
          # NOTE(review): `dashboard` is not a standard JSON Schema keyword;
          # presumably UI metadata for a dashboard form - confirm with consumer.
          dashboard:
            description: >-
              Valid Pydantic JSON schema defining the structured data to
              extract. Generate with YourModel.model_json_schema().
            type: json
        # Reference to a saved extraction schema; nullable string. Mutually
        # exclusive with page_schema.
        schema_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Schema Id
          description: >-
            ID of a saved extraction schema (e.g. sch_k8Hx9mP2nQ4v). Mutually
            exclusive with page_schema.
        # Selects a specific version of the schema named by schema_id;
        # nullable integer.
        schema_version:
          anyOf:
            - type: integer
            - type: 'null'
          title: Schema Version
          description: >-
            Version number of the schema to use. Only valid with schema_id. If
            omitted, the latest version is used.
        # Parsing mode; per the description it only applies when a file is
        # supplied rather than a checkpoint. Defaults to `fast`.
        mode:
          type: string
          title: Mode
          description: >-
            Output mode for parsing (only used when providing a file, not a
            checkpoint).
          default: fast
          # `enum` is the standard JSON Schema keyword that validators and
          # client generators honour. The non-standard `choices` list below is
          # kept unchanged for any tooling that already reads it.
          enum:
            - fast
            - balanced
            - accurate
          choices:
            - fast
            - balanced
            - accurate
          dashboard:
            description: Processing mode balancing speed and accuracy.
        # Upper bound on pages processed; nullable integer.
        max_pages:
          anyOf:
            - type: integer
            - type: 'null'
          title: Max Pages
          description: The maximum number of pages to process.
        # Page selection expressed as a comma-separated string of single pages
        # and ranges; nullable.
        page_range:
          anyOf:
            - type: string
            - type: 'null'
          title: Page Range
          description: The page range to process, comma separated like 0,5-10,20.
          dashboard:
            description: >-
              Comma-separated page ranges to process, e.g. '0-2,4'. Leave empty
              for all pages.
        # Format of the parsed text returned next to the extraction results.
        # NOTE(review): the 'markdown' default is stated only in the
        # description (no `default` keyword), and the value list appears only
        # under `dashboard.choices`, so the schema itself accepts any string.
        output_format:
          anyOf:
            - type: string
            - type: 'null'
          title: Output Format
          description: >-
            The output format for the parsed text alongside extraction results.
            Defaults to 'markdown'.
          dashboard:
            choices:
              - markdown
              - html
              - json
            description: Output format for the parsed text alongside extraction results.
            type: select
        # Boolean flag, off by default: keep a checkpoint of this run so later
        # extraction/segmentation calls can reuse it.
        save_checkpoint:
          type: boolean
          title: Save Checkpoint
          description: >-
            Save a checkpoint after processing for future
            extraction/segmentation calls.
          default: false
          dashboard:
            description: Save a checkpoint for later /extract or /segment calls.
        # Boolean flag, off by default: bypass cached results and re-run.
        skip_cache:
          type: boolean
          title: Skip Cache
          description: Skip the cache and re-run.
          default: false
          dashboard:
            description: Skip cache and re-run processing.
        # Callback URL invoked on completion; nullable string.
        webhook_url:
          anyOf:
            - type: string
            - type: 'null'
          title: Webhook Url
          description: Optional webhook URL to call when the request is complete.
        # Residency region override; nullable string. Per the description,
        # setting it rules out multipart uploads of the file.
        processing_location:
          anyOf:
            - type: string
            - type: 'null'
          title: Processing Location
          description: >-
            Optional residency region override (e.g. us, eu). When provided, use
            file_url or direct-upload; multipart uploads are rejected. When
            omitted, the request uses the team's configured residency and
            profile.
        # Links the request to a workflow step data record; nullable integer.
        workflowstepdata_id:
          anyOf:
            - type: integer
            - type: 'null'
          title: Workflowstepdata Id
          description: Optional workflow step data ID to associate with this request.
        # Extraction quality/latency trade-off; nullable string. Separate from
        # `mode`, which the spec describes as the parsing mode.
        # NOTE(review): the 'balanced' default and the two accepted values are
        # stated only in the description; no `default` or `enum` is declared.
        extraction_mode:
          anyOf:
            - type: string
            - type: 'null'
          title: Extraction Mode
          description: >-
            Extraction mode: 'fast' (lowest latency and cost) or 'balanced'
            (higher accuracy with per-field verification, reasoning, and
            citations; slower). Defaults to 'balanced'.
        # Nullable string with no description in the spec.
        # TODO(review): document the expected format of this field.
        model_override_settings:
          anyOf:
            - type: string
            - type: 'null'
          title: Model Override Settings
        # Uploaded document as a binary multipart part, or null when the input
        # is given another way.
        file:
          anyOf:
            - type: string
              format: binary
            - type: 'null'
          title: File
          description: >-
            Input PDF, word document, powerpoint, or image file, uploaded as
            multipart form data.  Images must be png, jpg, or webp format.
      type: object
      title: Body_extract_api_v1_extract_post
    # Body of the 200 response: identifies the submitted request and tells the
    # caller where to look up its status and results.
    InitialResponse:
      title: InitialResponse
      type: object
      required:
        - request_id
        - request_check_url
      properties:
        # Success flag; true unless stated otherwise.
        success:
          title: Success
          type: boolean
          default: true
          description: Whether the request was successful.
        # Error message, or null.
        error:
          title: Error
          description: >-
            If the request was not successful, this will contain an
            error message.
          anyOf:
            - type: string
            - type: 'null'
        # Handle used to check the request's status.
        request_id:
          title: Request Id
          type: string
          description: >-
            The ID of the request. This ID can be used to check the
            status of the request.
        # URL for checking status and fetching results.
        request_check_url:
          title: Request Check Url
          type: string
          description: The URL to check the status of the request and get results.
        # Library versions: a free-form object, a string, or null.
        versions:
          title: Versions
          description: A dictionary of the versions of the libraries used in the request.
          anyOf:
            - type: object
              additionalProperties: true
            - type: string
            - type: 'null'
    # Body of the 422 response: a list of individual validation errors.
    HTTPValidationError:
      title: HTTPValidationError
      type: object
      properties:
        detail:
          title: Detail
          type: array
          items:
            $ref: '#/components/schemas/ValidationError'
    # One validation failure; all three fields are required.
    ValidationError:
      title: ValidationError
      type: object
      required:
        - loc
        - msg
        - type
      properties:
        # Location of the failure as an array of strings and/or integers.
        loc:
          title: Location
          type: array
          items:
            anyOf:
              - type: string
              - type: integer
        # Error message.
        msg:
          title: Message
          type: string
        # Error type identifier.
        type:
          title: Error Type
          type: string
  # API-key authentication carried in the X-API-Key request header.
  securitySchemes:
    APIKeyHeader:
      type: apiKey
      name: X-API-Key
      in: header

````