Extract Article Text

Extract clean article content — title, author, date, and body text — from PDFs, Word docs, and web pages.

Who this is for

Content aggregators and newsletter platforms use this recipe to extract clean article content from PDFs, Word documents, and saved web pages. Define fields for title, author, publication date, body, summary, and category — the parser pulls the content and ignores headers, footers, navigation, and sidebars.

Request (cURL)
# Submit article.pdf (referenced by URL) together with a six-field extraction schema.
curl --request POST \
  --url https://api.iterationlayer.com/document-extraction/v1/extract \
  --header "Authorization: Bearer YOUR_API_KEY" \
  --header "Content-Type: application/json" \
  --data '{
    "files": [
      {
        "type": "url",
        "name": "article.pdf",
        "url": "https://example.com/article.pdf"
      }
    ],
    "schema": {
      "fields": [
        {
          "name": "title",
          "type": "TEXT",
          "description": "Article title or headline",
          "is_required": true
        },
        {
          "name": "author",
          "type": "TEXT",
          "description": "Author name"
        },
        {
          "name": "publish_date",
          "type": "DATE",
          "description": "Publication date of the article"
        },
        {
          "name": "body",
          "type": "TEXTAREA",
          "description": "Main article text content, excluding headers, footers, sidebars, and navigation",
          "is_required": true
        },
        {
          "name": "summary",
          "type": "TEXT",
          "description": "Brief summary or abstract",
          "max_length": 500
        },
        {
          "name": "category",
          "type": "TEXT",
          "description": "Article category or section"
        }
      ]
    }
  }'
Response
{
  "success": true,
  "data": {
    "title": {
      "value": "The Quiet Revolution in Battery Chemistry",
      "confidence": 0.97,
      "citations": ["The Quiet Revolution in Battery Chemistry"],
      "source": "article.pdf"
    },
    "author": {
      "value": "James Park",
      "confidence": 0.94,
      "citations": ["James Park"],
      "source": "article.pdf"
    },
    "publish_date": {
      "value": "2026-01-15",
      "confidence": 0.93,
      "citations": ["January 15, 2026"],
      "source": "article.pdf"
    },
    "body": {
      "value": "Solid-state batteries have been five years away for the last twenty years. But the latest generation of prototypes from three independent labs suggests the timeline might finally be real...",
      "confidence": 0.91,
      "citations": ["Solid-state batteries have been five years away"],
      "source": "article.pdf"
    },
    "summary": {
      "value": "Recent advances in solid-state battery technology suggest commercial viability within three years, driven by breakthroughs in solid electrolyte materials.",
      "confidence": 0.88,
      "citations": ["solid-state battery technology"],
      "source": "article.pdf"
    }
  }
}
Request (JavaScript)
import { IterationLayer } from "iterationlayer";

const client = new IterationLayer({ apiKey: "YOUR_API_KEY" });

// Document to process, referenced by URL.
const files = [
  {
    type: "url",
    name: "article.pdf",
    url: "https://example.com/article.pdf",
  },
];

// Fields to pull out of the article; title and body are flagged as required.
const fields = [
  {
    name: "title",
    type: "TEXT",
    description: "Article title or headline",
    is_required: true,
  },
  {
    name: "author",
    type: "TEXT",
    description: "Author name",
  },
  {
    name: "publish_date",
    type: "DATE",
    description: "Publication date of the article",
  },
  {
    name: "body",
    type: "TEXTAREA",
    description: "Main article text content, excluding headers, footers, sidebars, and navigation",
    is_required: true,
  },
  {
    name: "summary",
    type: "TEXT",
    description: "Brief summary or abstract",
    max_length: 500,
  },
  {
    name: "category",
    type: "TEXT",
    description: "Article category or section",
  },
];

const result = await client.extract({ files, schema: { fields } });
Response
{
  "success": true,
  "data": {
    "title": {
      "value": "The Quiet Revolution in Battery Chemistry",
      "confidence": 0.97,
      "citations": ["The Quiet Revolution in Battery Chemistry"],
      "source": "article.pdf"
    },
    "author": {
      "value": "James Park",
      "confidence": 0.94,
      "citations": ["James Park"],
      "source": "article.pdf"
    },
    "publish_date": {
      "value": "2026-01-15",
      "confidence": 0.93,
      "citations": ["January 15, 2026"],
      "source": "article.pdf"
    },
    "body": {
      "value": "Solid-state batteries have been five years away for the last twenty years. But the latest generation of prototypes from three independent labs suggests the timeline might finally be real...",
      "confidence": 0.91,
      "citations": ["Solid-state batteries have been five years away"],
      "source": "article.pdf"
    },
    "summary": {
      "value": "Recent advances in solid-state battery technology suggest commercial viability within three years, driven by breakthroughs in solid electrolyte materials.",
      "confidence": 0.88,
      "citations": ["solid-state battery technology"],
      "source": "article.pdf"
    }
  }
}
Request (Python)
from iterationlayer import IterationLayer

client = IterationLayer(api_key="YOUR_API_KEY")

# Document to process, referenced by URL.
source_files = [
    {
        "type": "url",
        "name": "article.pdf",
        "url": "https://example.com/article.pdf",
    }
]

# Fields to pull out of the article; title and body are flagged as required.
article_fields = [
    {
        "name": "title",
        "type": "TEXT",
        "description": "Article title or headline",
        "is_required": True,
    },
    {
        "name": "author",
        "type": "TEXT",
        "description": "Author name",
    },
    {
        "name": "publish_date",
        "type": "DATE",
        "description": "Publication date of the article",
    },
    {
        "name": "body",
        "type": "TEXTAREA",
        "description": "Main article text content, excluding headers, footers, sidebars, and navigation",
        "is_required": True,
    },
    {
        "name": "summary",
        "type": "TEXT",
        "description": "Brief summary or abstract",
        "max_length": 500,
    },
    {
        "name": "category",
        "type": "TEXT",
        "description": "Article category or section",
    },
]

result = client.extract(files=source_files, schema={"fields": article_fields})
Response
{
  "success": true,
  "data": {
    "title": {
      "value": "The Quiet Revolution in Battery Chemistry",
      "confidence": 0.97,
      "citations": ["The Quiet Revolution in Battery Chemistry"],
      "source": "article.pdf"
    },
    "author": {
      "value": "James Park",
      "confidence": 0.94,
      "citations": ["James Park"],
      "source": "article.pdf"
    },
    "publish_date": {
      "value": "2026-01-15",
      "confidence": 0.93,
      "citations": ["January 15, 2026"],
      "source": "article.pdf"
    },
    "body": {
      "value": "Solid-state batteries have been five years away for the last twenty years. But the latest generation of prototypes from three independent labs suggests the timeline might finally be real...",
      "confidence": 0.91,
      "citations": ["Solid-state batteries have been five years away"],
      "source": "article.pdf"
    },
    "summary": {
      "value": "Recent advances in solid-state battery technology suggest commercial viability within three years, driven by breakthroughs in solid electrolyte materials.",
      "confidence": 0.88,
      "citations": ["solid-state battery technology"],
      "source": "article.pdf"
    }
  }
}
Request (Go)
package main

import (
    "fmt"

    il "github.com/iterationlayer/sdk-go"
)

// main submits article.pdf (referenced by URL) with a six-field extraction
// schema and prints the extraction result.
func main() {
    client := il.NewClient("YOUR_API_KEY")

    // NOTE(review): the cURL, JavaScript, and Python samples mark title and
    // body as required and cap summary at 500 characters; this schema sets
    // neither. Confirm how the Go SDK expresses those options.
    result, err := client.Extract(il.ExtractRequest{
        Files: []il.FileInput{
            il.NewFileFromURL(
                "article.pdf",
                "https://example.com/article.pdf",
            ),
        },
        Schema: il.ExtractionSchema{
            "title": il.NewTextFieldConfig(
                "title",
                "Article title or headline",
            ),
            "author": il.NewTextFieldConfig(
                "author",
                "Author name",
            ),
            "publish_date": il.NewDateFieldConfig(
                "publish_date",
                "Publication date of the article",
            ),
            "body": il.NewTextareaFieldConfig(
                "body",
                "Main article text content, excluding headers, footers, sidebars, and navigation",
            ),
            "summary": il.NewTextFieldConfig(
                "summary",
                "Brief summary or abstract",
            ),
            "category": il.NewTextFieldConfig(
                "category",
                "Article category or section",
            ),
        },
    })
    if err != nil {
        panic(err)
    }

    // Go refuses to compile a local variable that is never used, so the
    // result has to be consumed for this example to build.
    fmt.Printf("%+v\n", result)
}
Response
{
  "success": true,
  "data": {
    "title": {
      "value": "The Quiet Revolution in Battery Chemistry",
      "confidence": 0.97,
      "citations": ["The Quiet Revolution in Battery Chemistry"],
      "source": "article.pdf"
    },
    "author": {
      "value": "James Park",
      "confidence": 0.94,
      "citations": ["James Park"],
      "source": "article.pdf"
    },
    "publish_date": {
      "value": "2026-01-15",
      "confidence": 0.93,
      "citations": ["January 15, 2026"],
      "source": "article.pdf"
    },
    "body": {
      "value": "Solid-state batteries have been five years away for the last twenty years. But the latest generation of prototypes from three independent labs suggests the timeline might finally be real...",
      "confidence": 0.91,
      "citations": ["Solid-state batteries have been five years away"],
      "source": "article.pdf"
    },
    "summary": {
      "value": "Recent advances in solid-state battery technology suggest commercial viability within three years, driven by breakthroughs in solid electrolyte materials.",
      "confidence": 0.88,
      "citations": ["solid-state battery technology"],
      "source": "article.pdf"
    }
  }
}

Related Recipes

Start building in minutes

Free trial credits included. No credit card required.