import os

import scrapy
from scrapy.spiders import SitemapSpider

# Database Operations
from database.insert import record_crawled_url
from database.update import update_crawl_status, update_crawl_user_agent, update_crawl_complete

# Utils & Configs
from utils.make import get_headers, get_spidey_senses
from logger.config import logger

class CartoCrawler(SitemapSpider):
    """
    A Scrapy spider for crawling sitemaps.

    Attributes
    ----------
    total_urls : int
        Total number of URLs crawled
    new_urls : int
        Number of new URLs found
    updated_urls : int
        Number of URLs updated

    Methods
    -------
    start_cartocrawler(crawl_uuid, start_url)
        Starts the CartoCrawler with the given crawl UUID and start URL.
    parse(response)
        Parses the given response and records the crawled URL.
    kraken_whats_next()
        Decides what the crawler does once the current sitemap is finished.
    """

    name = "CartoCrawler"

    total_urls = 0
    new_urls = 0
    updated_urls = 0

    def start_cartocrawler(self, crawl_uuid, start_url):
        """
        Starts the CartoCrawler with the given crawl UUID and start URL.

        Parameters
        ----------
        crawl_uuid : str
            The UUID of the crawl
        start_url : str
            The URL of the sitemap to start crawling
        """
        # Keep the crawl UUID on the spider so parse() and closed() can read it
        self.crawl_uuid = crawl_uuid

        # Update Crawl Status to queued
        status = 'queued'
        update_crawl_status(status, crawl_uuid)
        logger.info('Spinning up CartoCrawler...')

        # Get Headers
        spider = 'cartocrawler'
        headers, user_agent_id = get_headers(spider)
        logger.info('CartoCrawler headers set')

        # Define spider-specific settings (standard Scrapy settings alongside
        # project-specific keys such as PRIORITY and MAX_URLS)
        spidey_senses = {
            'CONCURRENT_REQUESTS': 8,
            'DNSCACHE_ENABLED': False,
            'SITEMAP_FOLLOW': True,
            'SITEMAP_ALTERNATIVE_LINKS': True,
            'ROBOTS_TXT': False,
            'ROBOTSTXT_OBEY': False,
            'PRIORITY': 1,
            'MAX_URLS': 10000,
        }
        spidey_senses.update(get_spidey_senses(self))
        self.spider_settings = spidey_senses
        logger.info(f'{spider} senses defined')

        # Add User Agent to Crawl Info
        if update_crawl_user_agent(user_agent_id, crawl_uuid):
            logger.info(f'Crawl {crawl_uuid} User Agent Recorded')
        else:
            logger.error(f'Crawl {crawl_uuid} User Agent Failed to Record')

        # Set the starting URLs to crawl
        self.sitemap_urls = [start_url]
        logger.info(f'Starting URL set to {start_url}')

        # Roll to Parser
        for url in self.sitemap_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def closed(self, reason):
        """Called by Scrapy once the spider finishes, so the counters below reflect the whole crawl."""
        # Crawl is done, log the results
        logger.info(f'{self.total_urls} crawled, {self.new_urls} new URLs found, {self.updated_urls} URLs updated')

        # Decide what's next for this crawl
        self.kraken_whats_next()

        # Sitemap Complete
        if update_crawl_complete(self.crawl_uuid, self.new_urls, self.updated_urls):
            logger.info(f'Crawl {self.crawl_uuid} successfully recorded')
        else:
            logger.error(f'Crawl {self.crawl_uuid} NOT RECORDED')

    def parse(self, response):
        """
        Parses the given response and records the crawled URL.

        Parameters
        ----------
        response : scrapy.http.Response
            The response to parse.

        Returns
        -------
        None
        """
        # Log Found URL
        logger.debug(f'Found URL: {response.url}')

        # Set variables for recording the crawled URL
        url = response.url
        crawl_uuid = self.crawl_uuid
        source_url = response.request.headers.get('Referer', None)
        if source_url is not None:
            # Scrapy header values are bytes; decode before handing off to the database layer
            source_url = source_url.decode()

        # Record crawled URL and count it toward the crawl totals
        self.total_urls += 1
        action = record_crawled_url(url, crawl_uuid, source_url)

        # Update counters
        if action == 'add':
            self.new_urls += 1
        elif action == 'update':
            self.updated_urls += 1

    def kraken_whats_next(self):
        """Decides what the crawler does once the current sitemap is finished."""
        spider = 'cartocrawler'
        crawl_type = 'kraken'
        logger.info(f"Determining what's next for the {crawl_type}'s {spider}...")
        if crawl_type == 'kraken':
            # What should the Kraken do
            logger.info(f"This looks to be a {crawl_type}'s {spider}")
            # should_sitemap_continue and setup_kraken_cartocrawler are project
            # helpers assumed to be available in this module's scope
            if should_sitemap_continue():
                logger.info(f"More sitemaps for the {crawl_type}, let's go again!!!")
                setup_kraken_cartocrawler()
            else:
                logger.info('No more sitemaps to crawl. Calling it a day...')
        elif crawl_type == 'harpoon':
            logger.warning('Harpoon asking for help! See manager...')
        # What to do if crawl_type doesn't match
        else:
            logger.error("I'm lost. Help me! See my manager...")
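

# A minimal local-run sketch: in normal operation this spider is presumably driven
# by the project's own orchestration, which supplies crawl_uuid and start_url and
# calls start_cartocrawler. The subclass, the UUID, and the sitemap URL below are
# placeholder assumptions for illustration only.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    class _LocalCartoCrawler(CartoCrawler):
        def start_requests(self):
            # Wire start_cartocrawler into Scrapy's normal entry point so the
            # engine consumes the requests it yields.
            yield from self.start_cartocrawler(
                crawl_uuid='00000000-0000-0000-0000-000000000000',
                start_url='https://example.com/sitemap.xml',
            )

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(_LocalCartoCrawler)
    process.start()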