Scrape Structured Web Data

Extract page titles, headings, links, and content from web pages into structured JSON for data collection workflows.

Who this is for

Data teams and content aggregation platforms use this recipe to extract structured information from web pages. Point the API at any URL and receive structured JSON with page title, headings, and links — ready for indexing, monitoring, or content analysis pipelines.

# Request the title, headings, and links of one page from the extraction endpoint.
# YOUR_API_KEY is a placeholder for a real API key.
curl --request POST \
  --header "Authorization: Bearer YOUR_API_KEY" \
  --header "Content-Type: application/json" \
  --data '{
    "files": [
      {
        "type": "url",
        "name": "page.html",
        "url": "https://example.com/products/widget-pro"
      }
    ],
    "schema": {
      "fields": [
        { "name": "page_title", "type": "TEXT", "description": "Title of the web page" },
        { "name": "headings", "type": "ARRAY", "description": "List of headings on the page", "fields": [
          { "name": "text", "type": "TEXT", "description": "Heading text" }
        ]},
        { "name": "links", "type": "ARRAY", "description": "List of links on the page", "fields": [
          { "name": "text", "type": "TEXT", "description": "Link anchor text" },
          { "name": "url", "type": "TEXT", "description": "Link URL" }
        ]}
      ]
    }
  }' \
  https://api.iterationlayer.com/document-extraction/v1/extract
import { IterationLayer } from "iterationlayer";
// The page to extract from, referenced by URL.
const pageFile = {
  type: "url",
  name: "page.html",
  url: "https://example.com/products/widget-pro",
};

// Nested fields requested for each heading and each link.
const headingFields = [
  { name: "text", type: "TEXT", description: "Heading text" },
];
const linkFields = [
  { name: "text", type: "TEXT", description: "Link anchor text" },
  { name: "url", type: "TEXT", description: "Link URL" },
];

// Extraction schema: one text field for the title plus two array fields.
const pageSchema = {
  fields: [
    { name: "page_title", type: "TEXT", description: "Title of the web page" },
    { name: "headings", type: "ARRAY", description: "List of headings on the page", fields: headingFields },
    { name: "links", type: "ARRAY", description: "List of links on the page", fields: linkFields },
  ],
};

const client = new IterationLayer({ apiKey: "YOUR_API_KEY" });

// Send the file reference and schema in a single extract call, then print the response.
const result = await client.extract({ files: [pageFile], schema: pageSchema });

console.log(result);
from iterationlayer import IterationLayer
# The page to extract from, referenced by URL.
page_file = {
    "type": "url",
    "name": "page.html",
    "url": "https://example.com/products/widget-pro",
}

# Nested fields requested for each heading and each link.
heading_fields = [
    {"name": "text", "type": "TEXT", "description": "Heading text"},
]
link_fields = [
    {"name": "text", "type": "TEXT", "description": "Link anchor text"},
    {"name": "url", "type": "TEXT", "description": "Link URL"},
]

# Extraction schema: one text field for the title plus two array fields.
page_schema = {
    "fields": [
        {"name": "page_title", "type": "TEXT", "description": "Title of the web page"},
        {
            "name": "headings",
            "type": "ARRAY",
            "description": "List of headings on the page",
            "fields": heading_fields,
        },
        {
            "name": "links",
            "type": "ARRAY",
            "description": "List of links on the page",
            "fields": link_fields,
        },
    ]
}

client = IterationLayer(api_key="YOUR_API_KEY")

# Send the file reference and schema in a single extract call, then print the response.
result = client.extract(files=[page_file], schema=page_schema)

print(result)
package main

import (
    "fmt"
    "log"

    il "github.com/iterationlayer/sdk-go"
)

client := il.NewClient("YOUR_API_KEY")

result, err := client.Extract(il.ExtractRequest{
    Files: []il.FileInput{
        il.NewFileFromURL("page.html", "https://example.com/products/widget-pro"),
    },
    Schema: il.ExtractionSchema{
        "page_title": il.NewTextFieldConfig("page_title", "Title of the web page"),
        "headings": il.NewArrayFieldConfig("headings", "List of headings on the page", []il.FieldConfig{
            il.NewTextFieldConfig("text", "Heading text"),
        }),
        "links": il.NewArrayFieldConfig("links", "List of links on the page", []il.FieldConfig{
            il.NewTextFieldConfig("text", "Link anchor text"),
            il.NewTextFieldConfig("url", "Link URL"),
        }),
    },
})

Related Recipes

Start building in minutes

Free trial credits included. No credit card required.