How to Extract Structured Data from Web Pages with an API - Recipes

curl | TypeScript | Python | Go

# Ask the document-extraction endpoint to read a product page by URL and
# return its title, headings, and links according to the schema below.
# YOUR_API_KEY is a placeholder for the bearer token.
curl --request POST \
  --url https://api.iterationlayer.com/document-extraction/v1/extract \
  --header "Authorization: Bearer YOUR_API_KEY" \
  --header "Content-Type: application/json" \
  --data '{
    "files": [
      {
        "type": "url",
        "name": "page.html",
        "url": "https://example.com/products/widget-pro"
      }
    ],
    "schema": {
      "fields": [
        { "name": "page_title", "type": "TEXT", "description": "Title of the web page" },
        { "name": "headings", "type": "ARRAY", "description": "List of headings on the page", "fields": [
          { "name": "text", "type": "TEXT", "description": "Heading text" }
        ]},
        { "name": "links", "type": "ARRAY", "description": "List of links on the page", "fields": [
          { "name": "text", "type": "TEXT", "description": "Link anchor text" },
          { "name": "url", "type": "TEXT", "description": "Link URL" }
        ]}
      ]
    }
  }'

import { IterationLayer } from "iterationlayer";
// Client configured with a placeholder API key.
const client = new IterationLayer({
  apiKey: "YOUR_API_KEY",
});

// Request extraction of the title, headings, and links from a page given by URL.
const result = await client.extract({
  files: [
    { type: "url", name: "page.html", url: "https://example.com/products/widget-pro" },
  ],
  schema: {
    fields: [
      // Single text value: the page title.
      {
        name: "page_title",
        type: "TEXT",
        description: "Title of the web page",
      },
      // Array of headings; each entry carries the heading text.
      {
        name: "headings",
        type: "ARRAY",
        description: "List of headings on the page",
        fields: [
          {
            name: "text",
            type: "TEXT",
            description: "Heading text",
          },
        ],
      },
      // Array of links; each entry carries the anchor text and the URL.
      {
        name: "links",
        type: "ARRAY",
        description: "List of links on the page",
        fields: [
          {
            name: "text",
            type: "TEXT",
            description: "Link anchor text",
          },
          {
            name: "url",
            type: "TEXT",
            description: "Link URL",
          },
        ],
      },
    ],
  },
});

console.log(result);

from iterationlayer import IterationLayer
# Client configured with a placeholder API key.
client = IterationLayer(api_key="YOUR_API_KEY")

# The page to read, referenced by URL.
page_file = {
    "type": "url",
    "name": "page.html",
    "url": "https://example.com/products/widget-pro",
}

# Sub-fields captured for each entry of the "headings" array.
heading_fields = [
    {"name": "text", "type": "TEXT", "description": "Heading text"},
]

# Sub-fields captured for each entry of the "links" array.
link_fields = [
    {"name": "text", "type": "TEXT", "description": "Link anchor text"},
    {"name": "url", "type": "TEXT", "description": "Link URL"},
]

# Full schema: one text field plus the two array fields defined above.
page_schema = {
    "fields": [
        {"name": "page_title", "type": "TEXT", "description": "Title of the web page"},
        {
            "name": "headings",
            "type": "ARRAY",
            "description": "List of headings on the page",
            "fields": heading_fields,
        },
        {
            "name": "links",
            "type": "ARRAY",
            "description": "List of links on the page",
            "fields": link_fields,
        },
    ]
}

result = client.extract(files=[page_file], schema=page_schema)

print(result)

package main

import (
	"fmt"
	"log"

	il "github.com/iterationlayer/sdk-go"
)

client := il.NewClient("YOUR_API_KEY")

result, err := client.Extract(il.ExtractRequest{
    Files: []il.FileInput{
        il.NewFileFromURL("page.html", "https://example.com/products/widget-pro"),
    },
    Schema: il.ExtractionSchema{
        "page_title": il.NewTextFieldConfig("page_title", "Title of the web page"),
        "headings": il.NewArrayFieldConfig("headings", "List of headings on the page", []il.FieldConfig{
            il.NewTextFieldConfig("text", "Heading text"),
        }),
        "links": il.NewArrayFieldConfig("links", "List of links on the page", []il.FieldConfig{
            il.NewTextFieldConfig("text", "Link anchor text"),
            il.NewTextFieldConfig("url", "Link URL"),
        }),
    },
})

Ingest

Transform

Generate

Categories

Featured

Overview

APIs

Integrations

Scrape Structured Web Data

Who this is for

Related Recipes

Automate Invoice Processing

Book Production Pipeline

Content Aggregation Pipeline

Start building in minutes