📦 EqualifyEverything / crawler

📄 rosevelt.py · 73 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import sys
import requests
from bs4 import BeautifulSoup
from multiprocessing import Process
from logger.config import logger
from database.select import next_rosevelt_url
from database.insert import record_new_crawled_htmls, create_new_crawl_html
from urllib.parse import urljoin, urlparse


def crawl_webpage(url_id, url, domain_id, timeout=10):
    """Fetch one page and record every link found on it.

    A crawl record is created first via ``create_new_crawl_html``. The page
    is then requested, every ``<a>`` tag carrying an ``href`` is collected
    (hrefs without a scheme are resolved against ``url``), and the batch is
    handed to ``record_new_crawled_htmls``.

    Args:
        url_id: Identifier of the URL being crawled; stored on each found
            link as ``source_url_id``.
        url: Address to fetch; also the base for resolving relative links.
        domain_id: Accepted for caller compatibility; not used by this
            function.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled server would block the worker indefinitely.

    Returns:
        None. If the request fails (including on timeout) the error is
        logged and no links are recorded; the crawl record created at the
        start is left as is.
    """
    # Create new crawl
    agent = "rosevelt"
    crawl_type = "html"
    crawl_id = create_new_crawl_html(crawl_type, url_id, agent)

    try:
        # Timeout errors are RequestException subclasses, so they are
        # handled by the same except clause as any other fetch failure.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching URL: {url} - {str(e)}")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    new_urls = []

    for link in soup.find_all("a", href=True):
        found_url = link["href"]
        # An href without a scheme is relative: resolve it against the page
        # URL. Hrefs that already carry a scheme are recorded unchanged.
        if not urlparse(found_url).scheme:
            found_url = urljoin(url, found_url)

        new_urls.append({
            "source_url_id": url_id,
            "url": found_url,
            "crawl_id": crawl_id,
        })

    # Log the number of new URLs found
    logger.info(f"Found {len(new_urls)} new urls")

    # Persist the found links in one batch.
    record_new_crawled_htmls(new_urls)


def main():
    """Crawl URLs one after another until none is handed out.

    Repeatedly asks ``next_rosevelt_url`` for work and passes the unpacked
    ``(url_id, url, domain_id)`` triple to ``crawl_webpage``. The loop ends
    as soon as a falsy value comes back.
    """
    pending = next_rosevelt_url()
    while pending:
        url_id, url, domain_id = pending
        crawl_webpage(url_id, url, domain_id)
        pending = next_rosevelt_url()


def run_rosevelt_main():
    """Run the Rosevelt crawl loop; used as the target of each worker process."""
    main()


if __name__ == "__main__":
    num_processes = 10

    # Build the worker processes, each running run_rosevelt_main, then
    # start them.
    processes = [
        Process(target=run_rosevelt_main) for _ in range(num_processes)
    ]
    for worker in processes:
        worker.start()

    # Block until every worker has exited.
    for worker in processes:
        worker.join()