In this example, we’ll demonstrate how to leverage our PDF Scraper API to extract PDF documents based on keywords.

Requirements

  • MrScraper console account.
  • MrScraper API token that you can get by following the steps here.

Example

In this example, we’ll scrape architectural plan PDFs from Google Search. After the scraping is complete, we can download the zipped PDF files.

Follow the steps below to use our PDF Scraper API:

  1. Send the request below:
curl --location 'https://app.mrscraper.com/api/scrapers/pdf/create-and-run' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer <token>' \
--data '{
    "name": "PDF Scraper",
    "expected_pdf": 200,
    "unique": true,
    "keywords": [
        {
            "keyword": "Architectural plan PDF",
            "prompt": "Return '\''true'\'' if the most of the image are architectural sketches, plan, blueprint, drawing. Return '\''false'\'' if most of the image are just text documents or other"
        }
    ]
}'
  2. Replace <token> in the Authorization header with your API token.
  3. The above request will return the following JSON response:
{
    "message": "Scraping queued successfully",
    "scraper": {
        "id": 3669,
        "name": "PDF Scraper",
        "url": [
            "Default"
        ],
        "urls": [
            "Default"
        ],
        "scheduled": false,
        "schedule": null,
        "created_at": "2024-10-15T07:38:15.000000Z",
        "updated_at": "2024-10-15T07:38:15.000000Z"
    },
    "results": [
        {
            "scraping_run_id": 375965,
            "user_id": 5573,
            "scraper_name": "PDF Scraper",
            "scrapped_url": "Architectural plan PDF",
            "scraper_id": 3669,
            "status": "running",
            "updated_at": "2024-10-15T07:38:15.000000Z",
            "created_at": "2024-10-15T07:38:15.000000Z",
            "id": 1169956,
            "scraper": {
                "id": 3669,
                "user_id": 5573,
                "sharing": false,
                "share_uuid": null,
                "name": "PDF Scraper",
                "type": "pdf",
                "urls": "Default",
                "pdf_urls": "Default",
                "ai_prompt": null,
                "ai_scope": null,
                "headers": null,
                "cookies": null,
                "user_agent": null,
                "disabled_resources": null,
                "delay": 0,
                "html_wanted": false,
                "screenshot_wanted": false,
                "screenshot_type": null,
                "locale": null,
                "scheduled": false,
                "cron": null,
                "cron_timezone": "UTC",
                "paginate": 0,
                "pagination_type": null,
                "infinite_pagination_type": null,
                "infinite_pagination_seconds": null,
                "infinite_pagination_text": null,
                "infinite_pagination_css_selector": null,
                "infinite_pagination_n_selector": null,
                "load_more_selector": null,
                "pagination_query_parameter": null,
                "pagination_next_page_selector": null,
                "pagination_limit_type": null,
                "pagination_max_page": null,
                "max_next_page": null,
                "pagination_max_variable": null,
                "created_at": "2024-10-15T07:38:15.000000Z",
                "updated_at": "2024-10-15T07:38:15.000000Z",
                "cron_minutes": "*",
                "cron_minutes_n_detail": null,
                "cron_minutes_x_detail": null,
                "cron_hours": "*",
                "cron_hours_n_detail": null,
                "cron_hours_x_detail": null,
                "cron_day_of_month": "*",
                "cron_day_of_month_x_detail": null,
                "cron_month": "*",
                "cron_month_x_detail": null,
                "cron_day_of_week": "*",
                "cron_day_of_week_x_detail": null,
                "click_action_enabled": false,
                "click_action_selector": null,
                "click_action_wait": "no",
                "workflow": [
                    {
                        "type": "options",
                        "data": {
                            "keywords": [
                                {
                                    "type": "keyword",
                                    "data": {
                                        "keyword": "Architectural plan PDF",
                                        "prompt": "Return 'true' if the most of the image are architectural sketches, plan, blueprint, drawing. Return 'false' if most of the image are just text documents or other"
                                    }
                                }
                            ]
                        }
                    }
                ],
                "version": 2,
                "proxy_type": "rotation",
                "proxy_host": null,
                "proxy_port": null,
                "proxy_username": null,
                "proxy_password": null,
                "parsers": null,
                "deleted_at": null,
                "external_auth": null
            }
        }
    ]
}
To adapt this endpoint to your own use case, please refer to this section.