Segment Document - Datalab Documentation

POST

api

segment

Segment Document

import requests

# Submit a document to the Datalab segment endpoint as a multipart form POST.
url = "https://www.datalab.to/api/v1/segment"

# Plain form fields; each value is sent as a string part of the multipart body.
payload = {
    "segmentation_schema": "<string>",
    "file_url": "<string>",
    "checkpoint_id": "<string>",
    "mode": "fast",
    "max_pages": "123",
    "page_range": "<string>",
    "save_checkpoint": "false",
    "skip_cache": "false",
    "webhook_url": "<string>",
    "processing_location": "<string>",
    "workflowstepdata_id": "123",
    "model_override_settings": "<string>",
    "file": "<string>"
}
headers = {"X-API-Key": "<api-key>"}

# Open the upload in a context manager so the file handle is closed as soon as
# the request completes, even when requests.post raises.
with open("example-file", "rb") as upload:
    files = {"file.0": ("example-file", upload)}
    response = requests.post(url, data=payload, files=files, headers=headers)

print(response.text)

# Submit a document to the Datalab segment endpoint as a multipart form POST.
curl -X POST \
  --url https://www.datalab.to/api/v1/segment \
  -H 'Content-Type: multipart/form-data' \
  -H 'X-API-Key: <api-key>' \
  -F 'segmentation_schema=<string>' \
  -F 'file_url=<string>' \
  -F 'checkpoint_id=<string>' \
  -F 'mode=fast' \
  -F 'max_pages=123' \
  -F 'page_range=<string>' \
  -F 'save_checkpoint=false' \
  -F 'skip_cache=false' \
  -F 'webhook_url=<string>' \
  -F 'processing_location=<string>' \
  -F 'workflowstepdata_id=123' \
  -F 'model_override_settings=<string>' \
  -F 'file=<string>' \
  -F 'file.0=@example-file'

// Build the multipart form body for the segment request.
const form = new FormData();
form.append('segmentation_schema', '<string>');
form.append('file_url', '<string>');
form.append('checkpoint_id', '<string>');
form.append('mode', 'fast');
form.append('max_pages', '123');
form.append('page_range', '<string>');
form.append('save_checkpoint', 'false');
form.append('skip_cache', 'false');
form.append('webhook_url', '<string>');
form.append('processing_location', '<string>');
form.append('workflowstepdata_id', '123');
form.append('model_override_settings', '<string>');
form.append('file', '<string>');
// A single-quoted string literal cannot span lines, so the placeholder value
// is written on one line with \n escapes (same characters as before).
// NOTE(review): a real upload presumably passes a Blob/File here — confirm.
form.append('file.0', '{\n  "fileName": "example-file"\n}');

// No Content-Type header is set here; only the API key is supplied explicitly.
const options = {
  method: 'POST',
  headers: {'X-API-Key': '<api-key>'},
  body: form,
};

// Send the request and log the parsed JSON response, or the error on failure.
fetch('https://www.datalab.to/api/v1/segment', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts a hand-built multipart/form-data body to the segment endpoint
// and prints the raw response body. Any failure is reported on stdout and
// ends the program early instead of being silently ignored.
func main() {
	url := "https://www.datalab.to/api/v1/segment"

	// Each part in the payload below is introduced by "--" + boundary, so the
	// boundary declared in Content-Type is the delimiter without its first
	// two dashes.
	const boundary = "---011000010111000001101001"

	payload := strings.NewReader("-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"segmentation_schema\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"file_url\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"checkpoint_id\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"mode\"\r\n\r\nfast\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"max_pages\"\r\n\r\n123\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"page_range\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"save_checkpoint\"\r\n\r\nfalse\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"skip_cache\"\r\n\r\nfalse\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"webhook_url\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"processing_location\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"workflowstepdata_id\"\r\n\r\n123\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"model_override_settings\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"file\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"file.0\"; filename=\"example-file\"\r\nContent-Type: application/octet-stream\r\n\r\n{\r\n  \"fileName\": \"example-file\"\r\n}\r\n-----011000010111000001101001--")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("X-API-Key", "<api-key>")
	// A multipart body needs its boundary declared in Content-Type; the
	// original request omitted this header entirely.
	req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary)

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil on error, so it must not be touched past this point.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response:", err)
		return
	}

	fmt.Println(string(body))
}

{
  "request_id": "<string>",
  "request_check_url": "<string>",
  "success": true,
  "error": "<string>",
  "versions": {}
}

{
  "detail": [
    {
      "loc": [
        "<string>"
      ],
      "msg": "<string>",
      "type": "<string>"
    }
  ]
}

Authorizations

X-API-Key

string

header

required

Cookies

wos-session

string

datalab_active_team

string

Body

multipart/form-data

segmentation_schema

string

required

JSON object (passed as a string) defining how to segment the document: {"segments": [{"name": ..., "description": ...}, ...], "segmentation_strategy": "custom"}. When 'segments' are provided they guide which sections to identify; for fully automatic document-boundary detection pass {"segmentation_strategy": "document_boundary"} with no segments. The same object (JSON-encoded) is used for segment steps in pipelines via settings.segmentation_schema.

file_url

string | null

Optional file URL. Provide either file/file_url or checkpoint_id.

checkpoint_id

string | null

Checkpoint ID from a previous /convert request (with save_checkpoint=true). Skips re-parsing when provided.

mode

string

default:fast

Output mode for parsing (only used when providing a file, not a checkpoint).

max_pages

integer | null

The maximum number of pages to process.

page_range

string | null

The page range to process, comma separated like 0,5-10,20.

save_checkpoint

boolean

default:false

Save a checkpoint after processing for future extraction/segmentation calls.

skip_cache

boolean

default:false

Skip the cache and re-run.

webhook_url

string | null

Optional webhook URL to call when the request is complete.

processing_location

string | null

Optional residency region override (e.g. us, eu). When provided, use file_url or direct-upload; multipart uploads are rejected. When omitted, the request uses the team's configured residency and profile.

workflowstepdata_id

integer | null

Optional workflow step data ID to associate with this request.

model_override_settings

string | null

file

file | null

Input PDF, Word document, PowerPoint, or image file, uploaded as multipart form data. Images must be in PNG, JPG, or WebP format.

Response

Successful Response

request_id

string

required

The ID of the request. This ID can be used to check the status of the request.

request_check_url

string

required

The URL to check the status of the request and get results.

success

boolean

default:true

Whether the request was successful.

error

string | null

If the request was not successful, this will contain an error message.

versions

A dictionary of the versions of the libraries used in the request.

Run Custom Pipeline: Execute a custom pipeline configuration. The pipeline_id must reference a completed custom pipeline ID or a template ID.

⌘I

Segment Document

import requests

# Submit a document to the Datalab segment endpoint as a multipart form POST.
url = "https://www.datalab.to/api/v1/segment"

# Plain form fields; each value is sent as a string part of the multipart body.
payload = {
    "segmentation_schema": "<string>",
    "file_url": "<string>",
    "checkpoint_id": "<string>",
    "mode": "fast",
    "max_pages": "123",
    "page_range": "<string>",
    "save_checkpoint": "false",
    "skip_cache": "false",
    "webhook_url": "<string>",
    "processing_location": "<string>",
    "workflowstepdata_id": "123",
    "model_override_settings": "<string>",
    "file": "<string>"
}
headers = {"X-API-Key": "<api-key>"}

# Open the upload in a context manager so the file handle is closed as soon as
# the request completes, even when requests.post raises.
with open("example-file", "rb") as upload:
    files = {"file.0": ("example-file", upload)}
    response = requests.post(url, data=payload, files=files, headers=headers)

print(response.text)

# Submit a document to the Datalab segment endpoint as a multipart form POST.
curl -X POST \
  --url https://www.datalab.to/api/v1/segment \
  -H 'Content-Type: multipart/form-data' \
  -H 'X-API-Key: <api-key>' \
  -F 'segmentation_schema=<string>' \
  -F 'file_url=<string>' \
  -F 'checkpoint_id=<string>' \
  -F 'mode=fast' \
  -F 'max_pages=123' \
  -F 'page_range=<string>' \
  -F 'save_checkpoint=false' \
  -F 'skip_cache=false' \
  -F 'webhook_url=<string>' \
  -F 'processing_location=<string>' \
  -F 'workflowstepdata_id=123' \
  -F 'model_override_settings=<string>' \
  -F 'file=<string>' \
  -F 'file.0=@example-file'

// Build the multipart form body for the segment request.
const form = new FormData();
form.append('segmentation_schema', '<string>');
form.append('file_url', '<string>');
form.append('checkpoint_id', '<string>');
form.append('mode', 'fast');
form.append('max_pages', '123');
form.append('page_range', '<string>');
form.append('save_checkpoint', 'false');
form.append('skip_cache', 'false');
form.append('webhook_url', '<string>');
form.append('processing_location', '<string>');
form.append('workflowstepdata_id', '123');
form.append('model_override_settings', '<string>');
form.append('file', '<string>');
// A single-quoted string literal cannot span lines, so the placeholder value
// is written on one line with \n escapes (same characters as before).
// NOTE(review): a real upload presumably passes a Blob/File here — confirm.
form.append('file.0', '{\n  "fileName": "example-file"\n}');

// No Content-Type header is set here; only the API key is supplied explicitly.
const options = {
  method: 'POST',
  headers: {'X-API-Key': '<api-key>'},
  body: form,
};

// Send the request and log the parsed JSON response, or the error on failure.
fetch('https://www.datalab.to/api/v1/segment', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts a hand-built multipart/form-data body to the segment endpoint
// and prints the raw response body. Any failure is reported on stdout and
// ends the program early instead of being silently ignored.
func main() {
	url := "https://www.datalab.to/api/v1/segment"

	// Each part in the payload below is introduced by "--" + boundary, so the
	// boundary declared in Content-Type is the delimiter without its first
	// two dashes.
	const boundary = "---011000010111000001101001"

	payload := strings.NewReader("-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"segmentation_schema\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"file_url\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"checkpoint_id\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"mode\"\r\n\r\nfast\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"max_pages\"\r\n\r\n123\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"page_range\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"save_checkpoint\"\r\n\r\nfalse\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"skip_cache\"\r\n\r\nfalse\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"webhook_url\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"processing_location\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"workflowstepdata_id\"\r\n\r\n123\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"model_override_settings\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"file\"\r\n\r\n<string>\r\n-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"file.0\"; filename=\"example-file\"\r\nContent-Type: application/octet-stream\r\n\r\n{\r\n  \"fileName\": \"example-file\"\r\n}\r\n-----011000010111000001101001--")

	req, err := http.NewRequest("POST", url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	req.Header.Add("X-API-Key", "<api-key>")
	// A multipart body needs its boundary declared in Content-Type; the
	// original request omitted this header entirely.
	req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary)

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil on error, so it must not be touched past this point.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response:", err)
		return
	}

	fmt.Println(string(body))
}

{
  "request_id": "<string>",
  "request_check_url": "<string>",
  "success": true,
  "error": "<string>",
  "versions": {}
}

{
  "detail": [
    {
      "loc": [
        "<string>"
      ],
      "msg": "<string>",
      "type": "<string>"
    }
  ]
}