# Source: contentapi-examples/python/batch-scraper/main.py (stabem/contentapi-examples on GitHub)
"""
Batch Scraper with ContentAPI

Extract content from up to 50 URLs in a single API call:
  - Submit URLs via the batch endpoint
  - Get clean, structured content for each URL
  - Export results as JSON, CSV, or markdown

Usage:
  export CONTENTAPI_KEY=sk_live_your_key_here
  pip install -r requirements.txt
  python main.py

Get a free API key at https://getcontentapi.com (5,000 requests/month)
"""

import os
import sys
import json
import csv
from datetime import datetime
from contentapi import ContentAPI

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# The API key is read from the environment at import time. When it is missing
# or empty the script prints a hint and exits with status 1, so nothing below
# this point runs without a key.
API_KEY = os.environ.get("CONTENTAPI_KEY")
if not API_KEY:
    print("Error: Set CONTENTAPI_KEY environment variable")
    print("Get a free key at https://getcontentapi.com")
    sys.exit(1)

# URLs to extract — batch endpoint supports up to 50 at once
URLS = [
    "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "https://en.wikipedia.org/wiki/TypeScript",
    "https://en.wikipedia.org/wiki/Rust_(programming_language)",
    "https://en.wikipedia.org/wiki/Go_(programming_language)",
    "https://en.wikipedia.org/wiki/JavaScript",
    "https://en.wikipedia.org/wiki/Java_(programming_language)",
    "https://en.wikipedia.org/wiki/C%2B%2B",
    "https://en.wikipedia.org/wiki/Swift_(programming_language)",
    "https://en.wikipedia.org/wiki/Kotlin_(programming_language)",
    "https://en.wikipedia.org/wiki/Ruby_(programming_language)",
    "https://en.wikipedia.org/wiki/PHP",
    "https://en.wikipedia.org/wiki/Scala_(programming_language)",
    "https://en.wikipedia.org/wiki/Elixir_(programming_language)",
    "https://en.wikipedia.org/wiki/Haskell_(programming_language)",
    "https://en.wikipedia.org/wiki/Clojure",
]

# Output format: "json", "csv", or "markdown"
# (main() falls back to the JSON exporter for any other value)
OUTPUT_FORMAT = "json"
# Base name of the output file, without extension; each exporter appends its
# own suffix. The timestamp is taken once, when this module is loaded.
OUTPUT_FILE = f"batch_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

# ---------------------------------------------------------------------------
# Batch Extraction
# ---------------------------------------------------------------------------

def batch_extract(urls: list[str]) -> list[dict]:
    """
    Extract content from multiple URLs in a single batch API call.

    The batch endpoint is more efficient than individual calls:
    - Single API call for up to 50 URLs
    - Parallel processing on the server
    - Counts as one request per URL in your quota

    Args:
        urls: URLs to submit in one batch.

    Returns:
        One dict per successfully extracted URL with the keys ``url``,
        ``title``, ``content``, ``word_count``, ``description`` and
        ``language``. String fields are never ``None``. URLs whose result
        carries an ``error`` are reported on stdout and left out. An empty
        list is returned when ``urls`` is empty or when the batch call or
        result processing raises.
    """
    if not urls:
        # Nothing to submit: skip client construction and the API round-trip.
        print("No URLs to extract.")
        return []

    client = ContentAPI(api_key=API_KEY)

    print(f"📦 Submitting batch of {len(urls)} URLs...")
    print()

    # Deliberately broad: this is the script's top-level boundary for the
    # remote call, and any failure is reported and turned into "no results".
    try:
        results = client.web.extract_batch(urls=urls)

        successful = []
        failed = []

        for result in results:
            url = result.get("url", "unknown")

            if result.get("error"):
                failed.append({"url": url, "error": result["error"]})
                print(f"  ❌ {url}: {result['error']}")
                continue

            # ``or`` (rather than a .get default) also covers keys that are
            # present but None/empty, so the exporters always get strings.
            title = result.get("title") or "Untitled"
            content = result.get("content") or ""
            word_count = len(content.split())

            successful.append({
                "url": url,
                "title": title,
                "content": content,
                "word_count": word_count,
                "description": result.get("description") or "",
                "language": result.get("language") or "",
            })
            print(f"  ✅ {title} ({word_count:,} words)")

        print(f"\n📊 Results: {len(successful)} successful, {len(failed)} failed")
        return successful

    except Exception as e:
        print(f"❌ Batch extraction failed: {e}")
        return []


# ---------------------------------------------------------------------------
# Export Functions
# ---------------------------------------------------------------------------

def export_json(results: list[dict], filename: str) -> str:
    """Export results as a JSON file.

    Args:
        results: Extraction results; each dict must contain ``word_count``.
        filename: Output path without extension; ``.json`` is appended.

    Returns:
        The path of the file that was written.
    """
    filepath = f"{filename}.json"

    output = {
        "extracted_at": datetime.now().isoformat(),
        "total_urls": len(results),
        "total_words": sum(r["word_count"] for r in results),
        "results": results,
    }

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file has
    # to be opened as UTF-8 explicitly: the platform default encoding (for
    # example cp1252 on Windows) cannot represent all of them.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Saved to {filepath}")
    return filepath


def export_csv(results: list[dict], filename: str):
    """Write the results to ``<filename>.csv`` and return that path.

    The file starts with a header row; each result dict then becomes one row
    with the columns url, title, word_count, description, language, content.
    """
    columns = ["url", "title", "word_count", "description", "language", "content"]
    filepath = f"{filename}.csv"

    # newline="" leaves line-ending handling to the csv module.
    with open(filepath, "w", newline="", encoding="utf-8") as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        for row in results:
            table.writerow(row)

    print(f"\n💾 Saved to {filepath}")
    return filepath


def export_markdown(results: list[dict], filename: str) -> str:
    """Export results as a Markdown file.

    The file holds a summary header followed by one section per result with
    its URL, word count, language, optional description, and a preview of at
    most 500 characters of content.

    Args:
        results: Extraction results; each dict must contain ``title``,
            ``url`` and ``word_count``. ``language``, ``description`` and
            ``content`` may be missing, empty or ``None``.
        filename: Output path without extension; ``.md`` is appended.

    Returns:
        The path of the file that was written.
    """
    filepath = f"{filename}.md"

    with open(filepath, "w", encoding="utf-8") as f:
        f.write("# Batch Extraction Results\n\n")
        f.write(f"Extracted at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write(f"Total URLs: {len(results)} | Total words: {sum(r['word_count'] for r in results):,}\n\n")
        f.write("---\n\n")

        for i, result in enumerate(results, 1):
            # ``or`` makes the fallback apply to an empty or None language as
            # well; a .get default alone only covers a missing key.
            language = result.get("language") or "N/A"

            f.write(f"## {i}. {result['title']}\n\n")
            f.write(f"**URL:** {result['url']}  \n")
            f.write(f"**Words:** {result['word_count']:,}  \n")
            f.write(f"**Language:** {language}  \n\n")

            if result.get("description"):
                f.write(f"> {result['description']}\n\n")

            # Include first 500 chars of content as preview; treat missing or
            # None content as empty rather than failing on len().
            content = result.get("content") or ""
            preview = content[:500] + "..." if len(content) > 500 else content
            f.write(f"{preview}\n\n")
            f.write("---\n\n")

    print(f"\n💾 Saved to {filepath}")
    return filepath


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Run the batch extraction for URLS and export the results.

    Prints a banner, extracts every URL in one batch, writes the results with
    the exporter selected by OUTPUT_FORMAT (JSON for any unrecognized value),
    and prints a summary. Exits with status 1 when nothing was extracted.
    """
    rule = "=" * 60

    print("🚀 Batch Scraper — Powered by ContentAPI")
    print(rule)
    print(f"URLs to extract: {len(URLS)}")
    print(f"Output format: {OUTPUT_FORMAT}")
    print(rule)
    print()

    # One batch call covers every configured URL.
    extracted = batch_extract(URLS)
    if not extracted:
        print("No results to export.")
        sys.exit(1)

    # Choose the writer for the configured format; JSON is the fallback.
    if OUTPUT_FORMAT == "csv":
        write_output = export_csv
    elif OUTPUT_FORMAT == "markdown":
        write_output = export_markdown
    else:
        write_output = export_json
    saved_path = write_output(extracted, OUTPUT_FILE)

    # Closing summary.
    word_total = sum(item["word_count"] for item in extracted)
    print("\n" + rule)
    print("✅ Batch extraction complete!")
    print(f"   URLs extracted: {len(extracted)}")
    print(f"   Total words: {word_total:,}")
    print(f"   Output: {saved_path}")
    print(rule)


if __name__ == "__main__":
    main()