POST /workspaces/{workspaceId}/crawler/jobs

Create crawler job
curl --request POST \
  --url https://eu-gcp-api.vg-stuff.com/v3/workspaces/{workspaceId}/crawler/jobs \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "urls": [
    "https://example.com"
  ],
  "crawl": true,
  "crawlOptions": {
    "maxPages": 25,
    "urlMatchers": [
      "/"
    ],
    "stayOnDomain": true
  },
  "webhook": {
    "url": "https://example.com/webhooks/crawler",
    "events": [
      "page_scraped",
      "job_completed",
      "job_failed"
    ]
  }
}
'
{
  "success": true,
  "message": "<string>",
  "data": {
    "id": "<string>",
    "workspaceId": "<string>",
    "status": "queued",
    "primaryUrl": "<string>",
    "urls": [
      "<string>"
    ],
    "crawl": true,
    "crawlOptions": {
      "maxPages": 250,
      "urlMatchers": [
        "<string>"
      ],
      "unMatchers": [
        "<string>"
      ],
      "stayOnDomain": true
    },
    "useProxy": true,
    "deep": true,
    "refreshRate": "<string>",
    "toAgentId": "<string>",
    "toAgentIds": [
      "<string>"
    ],
    "done": true,
    "failed": true,
    "isCancelled": true,
    "message": "<string>",
    "resultError": "<string>",
    "createdAt": "<string>",
    "ts": 123,
    "currentPageIndex": 123,
    "scrapedPagesNum": 123,
    "failedPagesNum": 123,
    "pageLimit": 123,
    "creditsPerPage": 123,
    "estimatedCredits": 123,
    "activeScrapeUrl": "<string>",
    "crawlerJobId": "<string>",
    "webhook": {
      "url": "<string>",
      "events": [
        "page_scraped"
      ],
      "hasSecret": true,
      "hasBearerToken": true,
      "headerKeys": [
        "<string>"
      ]
    }
  }
}

Overview

Creates a new crawler job for a workspace and starts processing it in the background.
Crawler jobs are immutable after creation. If you need different settings, delete the job and create a new one.

Supports

  • Single-page scrape jobs
  • Multi-URL scrape jobs
  • Crawl jobs with crawlOptions
  • Optional outbound webhooks for page_scraped, job_completed, and job_failed

Billing

Credits are estimated at submission time and consumed per successfully scraped page.
Use useProxy: true only when needed; proxy scraping has a higher per-page credit cost.
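The exact pricing formula is not documented here, but the response fields pageLimit, creditsPerPage, estimatedCredits, and scrapedPagesNum suggest the relationship sketched below. Treat the formula as an assumption, not a billing guarantee.

```python
def estimate_credits(page_limit, credits_per_page):
    # Assumed upper-bound estimate at submission time:
    # pageLimit * creditsPerPage. The server reports its own
    # estimatedCredits value on the job object.
    return page_limit * credits_per_page

def consumed_credits(scraped_pages_num, credits_per_page):
    # Credits are consumed only for successfully scraped pages,
    # so actual spend can be lower than the estimate.
    return scraped_pages_num * credits_per_page
```

For example, a 25-page crawl at 2 credits per page is estimated at 50 credits, but a run that successfully scrapes only 18 pages consumes 36.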

Authorizations

Authorization
string
header
required

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.

Path Parameters

workspaceId
string
required

The workspace that owns the crawler job.

Body

application/json
urls
string<uri>[]
required

One or more source URLs to scrape or use as crawl entry points.

Minimum array length: 1
crawl
boolean

If true, discovered URLs can be followed and scraped as part of the same job.

crawlOptions
object
deep
boolean

If true, use deep scraping behavior.

useProxy
boolean

If true, the crawler uses proxy scraping and paid proxy pricing.

refreshRate
string

Optional refresh cadence for KB-linked scrapes.

toAgentId
string

Optional single agent destination for KB import.

toAgentIds
string[]

Optional list of agent destinations for KB import.

webhook
object

Optional outbound webhook that receives page_scraped, job_completed, and job_failed events.
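The response's webhook object exposes hasSecret and hasBearerToken flags, which suggests deliveries can carry a shared-secret signature or a bearer token. The receiver below is a sketch under that assumption; the signature scheme (HMAC-SHA256 of the raw body) and the payload's event field are assumptions, not documented behavior.

```python
import hmac
import hashlib

def verify_signature(secret: bytes, body: bytes, signature_hex: str) -> bool:
    """Compare an HMAC-SHA256 of the raw request body against the received
    signature. Header name and scheme are assumptions, not documented here."""
    expected = hmac.new(secret, body, hashlib.sha256).hexdigest()
    return hmac.compare_digest(expected, signature_hex)

def handle_event(payload: dict) -> str:
    """Dispatch on the event type; the event names come from this page."""
    event = payload.get("event")
    if event == "page_scraped":
        return "store page"
    if event == "job_completed":
        return "mark job done"
    if event == "job_failed":
        return "alert on failure"
    return "ignore"
```

Always verify the signature against the raw bytes of the body before parsing JSON, and use a constant-time comparison as shown.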

Response

Successful response

success
boolean
required
message
string
required
data
object
required
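Every response uses the envelope shown in the schema above (success, message, data). A minimal sketch of unwrapping it client-side; the sample values ("job_123") are illustrative, and raising on success: false is a design choice, not mandated by the API.

```python
def unwrap_job(response_body: dict) -> dict:
    """Return the job object from a successful envelope, or raise with the
    server-provided message."""
    if not response_body.get("success"):
        raise RuntimeError(
            response_body.get("message") or "crawler job creation failed"
        )
    return response_body["data"]

# Illustrative envelope matching the response schema above.
sample = {
    "success": True,
    "message": "Job created",
    "data": {"id": "job_123", "status": "queued", "crawl": True},
}
job = unwrap_job(sample)  # job["status"] is "queued" until processing starts
```

Since jobs are immutable after creation, keep data.id if you need to reference, cancel, or delete the job later.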