📦 EqualifyEverything / crawler

📄 CartoCrawler.py · 111 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111import os
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import SitemapSpider
# Database Operations
from database.insert import record_crawled_url
from database.update import update_crawl_status, update_crawl_user_agent, update_crawl_complete
# Utils & Configs
from utils.make import get_headers, get_spidey_senses
from logger.config import logger
from crawls.kraken import kraken_whats_next



def start_cartocrawler(crawl_uuid, start_url):
    carto_crawler = CartoCrawler(start_url=start_url, crawl_uuid=crawl_uuid)

    # Update Crawl Status to queued
    status = 'queued'
    update_crawl_status(status, crawl_uuid)
    logger.info(f'🕸️🕷️🗺️ Spinning up CartoCrawler... ')

    # Get Headers
    spider = 'cartocrawler'
    headers, user_agent_id = get_headers(spider)
    logger.info(f'🕸️🕷️🗺️ {spider} headers set')

    # Define spider-specific settings
    spidey_senses = {
        'CONCURRENT_REQUESTS': 8,
        'DNSCACHE_ENABLED': False,
        'SITEMAP_FOLLOW': True,
        'SITEMAP_ALTERNATIVE_LINKS': True,
        'ROBOTS_TXT': False,
        'ROBOTSTXT_OBEY': False,
        'PRIORITY': 1,
        'MAX_URLS': 10000,
    }
    spidey_senses.update(get_spidey_senses(carto_crawler))
    carto_crawler.spider_settings = spidey_senses
    logger.info(f'🕸️🕷️🗺️ {spider} senses defined')

    # Add User Agent to Crawl Info
    if update_crawl_user_agent(user_agent_id, crawl_uuid):
        logger.info(f'🕸️🕷️🗺️ Crawl {crawl_uuid} User Agent Recorded')
    else:
        logger.error(f'🕸️🕷️🗺️ Crawl {crawl_uuid} Failed to Record')

    # Set the starting URLs to crawl
    carto_crawler.sitemap_urls = [start_url]
    logger.info(f'🕸️🕷️🗺️ Starting URL set to {start_url}')

    # Create a CrawlerProcess
    process = CrawlerProcess(settings={
        'USER_AGENT': headers['User-Agent'],
        **carto_crawler.spider_settings
    })

    # Start the CrawlerProcess
    process.crawl(carto_crawler.__class__, start_url=start_url, crawl_uuid=crawl_uuid)
    process.start()

    # Crawl is done, log the results
    logger.info(f'🕸️🕷️🗺️ {carto_crawler.total_urls} crawled, {carto_crawler.new_urls} new URLs found, {carto_crawler.updated_urls} URLs updated')

    # Set Vars for What's Next
    crawl_type = 'kraken'
    kraken_whats_next(crawl_type)
    # End of Script



class CartoCrawler(SitemapSpider):
    name = "carto"

    def __init__(self, *args, start_url=None, crawl_uuid=None, **kwargs):
        super(CartoCrawler, self).__init__(*args, **kwargs)
        self.start_url = start_url
        self.crawl_uuid = crawl_uuid
        self.total_urls = 0
        self.new_urls = 0
        self.updated_urls = 0

        # Sitemap Complete
        if update_crawl_complete(crawl_uuid, self.new_urls, self.updated_urls):
            # Sitemap Complete
            if update_crawl_complete(crawl_uuid, self.new_urls, self.updated_urls):
                logger.info(f'Crawl {crawl_uuid} successfully recorded')
            else:
                logger.error(f'Crawl {crawl_uuid} NOT RECORDED')


    def parse(self, response):
        # Log Found URL
        logger.debug(f'Found URL: {response.url}')

        # Set variables for recording crawled url
        url = response.url
        crawl_uuid = self.crawl_uuid
        source_url = response.request.headers.get('Referer', None)

        # Record crawled URL
        action = record_crawled_url(response.url, self.crawl_uuid, response.request.url)

        # Update counters
        if action == 'add':
            self.new_urls += 1
        elif action == 'update':
            self.updated_urls += 1