📦 EqualifyEverything / crawler

📄 CartoCrawler.py · 150 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import scrapy
from scrapy.spiders import SitemapSpider
# Database Operations
from database.insert import record_crawled_url
from database.update import update_crawl_status, update_crawl_user_agent, update_crawl_complete
# Utils & Configs
from utils.make import get_headers, get_spidey_senses
from logger.config import logger

class CartoCrawler(SitemapSpider):
    """
    A Scrapy spider for crawling sitemaps.

    Attributes
    ----------
    total_urls : int
        Total number of URLs crawled
    new_urls : int
        Number of new URLs found
    updated_urls : int
        Number of URLs updated

    Methods
    -------
    start_cartocrawler(crawl_uuid, start_url)
        Starts the CartoCrawler with the given crawl UUID and start URL.
    parse(response)
        Parses the given response and records the crawled URL.
    """
    # NOTE: the docstring above is now the first statement in the class body,
    # so it is actually exposed as CartoCrawler.__doc__ (previously it sat
    # after `name = ...` and was a no-op expression statement).
    name = "CartoCrawler"

    # Class-level counters; updated by parse() as responses arrive.
    total_urls = 0
    new_urls = 0
    updated_urls = 0

    def start_cartocrawler(self, crawl_uuid, start_url):
        """
        Starts the CartoCrawler with the given crawl UUID and start URL.

        Parameters
        ----------
        crawl_uuid : str
            The UUID of the crawl
        start_url : str
            The URL of the sitemap to start crawling

        Yields
        ------
        scrapy.Request
            One request per starting sitemap URL, routed to ``parse``.
        """
        # Update Crawl Status to queued
        status = 'queued'
        update_crawl_status(status, crawl_uuid)
        logger.info('Spinning up CartoCrawler... ')

        # Get Headers
        spider = 'cartocrawler'
        headers, user_agent_id = get_headers(spider)
        logger.info('CartoCrawler headers set')

        # Define spider-specific settings; get_spidey_senses() values win on
        # key collisions so per-crawl configuration can override these defaults.
        spidey_senses = {
            'CONCURRENT_REQUESTS': 8,
            'DNSCACHE_ENABLED': False,
            'SITEMAP_FOLLOW': True,
            'SITEMAP_ALTERNATIVE_LINKS': True,
            'ROBOTS_TXT': False,
            'ROBOTSTXT_OBEY': False,
            'PRIORITY': 1,
            'MAX_URLS': 10000,
        }
        spidey_senses.update(get_spidey_senses(self))
        self.spider_settings = spidey_senses
        logger.info(f'{spider} senses defined')

        # Add User Agent to Crawl Info
        if update_crawl_user_agent(user_agent_id, crawl_uuid):
            logger.info(f'Crawl {crawl_uuid} User Agent Recorded')
        else:
            logger.error(f'Crawl {crawl_uuid} Failed to Record')

        # Set the starting URLs to crawl
        self.sitemap_urls = [start_url]
        logger.info(f'Starting URL set to {start_url}')

        # Roll to Parser
        for url in self.sitemap_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

        # NOTE(review): everything below runs when this generator is
        # exhausted by the scheduler, i.e. after the requests are *queued*,
        # not after the crawl has finished — the counters logged here may
        # still be incomplete. Consider moving this into a spider_closed
        # signal handler; behavior kept as-is pending confirmation.
        logger.info(f'{self.total_urls} crawled, {self.new_urls} new URLs found, {self.updated_urls} URLs updated')

        # Determine what's next for the kraken's cartocrawler.
        # Fixed: the original called bare `kraken_whats_next(self, spider,
        # crawl_type)`, which raised NameError (no such module-level name)
        # and passed arguments the method does not accept. The method sets
        # its own spider/crawl_type internally.
        self.kraken_whats_next()

        # Sitemap Complete
        if update_crawl_complete(crawl_uuid, self.new_urls, self.updated_urls):
            logger.info(f'Crawl {crawl_uuid} successfully recorded')
        else:
            logger.error(f'Crawl {crawl_uuid} NOT RECORDED')

    def parse(self, response):
        """
        Parses the given response and records the crawled URL.

        Parameters:
        -----------
        response : scrapy.Response
            The response to parse.

        Returns:
        --------
        None
        """
        # Log Found URL
        logger.debug(f'Found URL: {response.url}')

        # Record crawled URL.
        # NOTE(review): the original extracted the Referer header into a
        # `source_url` local but then passed `response.request.url` here —
        # confirm which one record_crawled_url's third argument expects.
        # Behavior preserved; dead locals (url, crawl_uuid, source_url)
        # removed. `self.crawl_uuid` is presumably set via spider kwargs —
        # not visible in this file.
        action = record_crawled_url(response.url, self.crawl_uuid, response.request.url)

        # Update counters based on whether the URL was new or refreshed.
        if action == 'add':
            self.new_urls += 1
        elif action == 'update':
            self.updated_urls += 1

    def kraken_whats_next(self):
        """
        Decide whether to keep crawling sitemaps once the current batch is done.

        For a 'kraken' crawl, starts another sitemap pass if any remain;
        otherwise stops. The 'harpoon'/fallback branches are currently
        unreachable because crawl_type is hard-coded to 'kraken'.
        """
        spider = 'cartocrawler'
        crawl_type = 'kraken'
        logger.info(f'Determining whats next for the {crawl_type}\'s {spider}... ')
        if crawl_type == 'kraken':
            # What should the Kraken do
            logger.info(f'This looks to be a {crawl_type}\'s {spider} ')
            # NOTE(review): should_sitemap_continue and
            # setup_kraken_cartocrawler are not imported anywhere in this
            # file — confirm their module and add the imports, or these
            # calls raise NameError.
            if should_sitemap_continue():
                # Fixed: missing f-prefix — the original logged the literal
                # text '{crawl_type}'.
                logger.info(f'More sitemaps for the {crawl_type}, lets go again!!! ')
                setup_kraken_cartocrawler()
            else:
                logger.info('No more sitemaps to crawl. Calling it a day... ')
        elif crawl_type == 'harpoon':
            logger.warning('Harpoon asking for help! See manager... ')
        # What to do if crawl_type doesn't match
        else:
            logger.error('I\'m lost. Help me! See my manager... ')