1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import scrapy
from scrapy.spiders import SitemapSpider
from logger.config import logger
from database.update import update_crawl_status, update_crawl_user_agent, update_crawl_complete
from database.insert import record_crawled_url
from utils.make import get_headers, get_spidey_senses
from utils.check import should_sitemap_continue
from crawls.kraken import setup_cartocrawler
class CartoCrawler(SitemapSpider):
    """
    A Scrapy spider for crawling sitemaps.

    Attributes
    ----------
    total_urls : int
        Total number of URLs crawled (incremented once per parsed response)
    new_urls : int
        Number of new URLs found
    updated_urls : int
        Number of URLs updated
    crawl_uuid : str
        UUID of the current crawl; stored by ``start_cartocrawler`` and read
        by ``parse``
    spider_settings : dict
        Spider-specific settings assembled by ``start_cartocrawler``

    Methods
    -------
    start_cartocrawler(crawl_uuid, start_url)
        Starts the CartoCrawler with the given crawl UUID and start URL.
    parse(response)
        Parses the given response and records the crawled URL.
    cartocrawler_future()
        Determines what the CartoCrawler should do next.
    """

    # Class-level defaults; ``+=`` on an instance rebinds them per instance.
    total_urls = 0
    new_urls = 0
    updated_urls = 0

    def start_cartocrawler(self, crawl_uuid: str, start_url: str):
        """
        Starts the CartoCrawler with the given crawl UUID and start URL.

        This method is a generator: none of its body runs until the returned
        generator is iterated, and the completion bookkeeping after the
        request loop runs only once the generator is exhausted.

        Parameters
        ----------
        crawl_uuid : str
            The UUID of the crawl
        start_url : str
            The URL of the sitemap to start crawling

        Yields
        ------
        scrapy.Request
            One request per sitemap URL, with ``self.parse`` as callback.
        """
        # Update Crawl Status to queued
        update_crawl_status('queued', crawl_uuid)
        logger.info('Spinning up CartoCrawler... ')

        # Remember the crawl so parse() can attribute recorded URLs to it
        self.crawl_uuid = crawl_uuid

        # Get Headers
        spider = 'cartocrawler'
        headers, user_agent_id = get_headers(spider)
        logger.info('CartoCrawler headers set')

        # Define spider-specific settings; values returned by
        # get_spidey_senses() override these defaults.
        spidey_senses = {
            'CONCURRENT_REQUESTS': 8,
            'DNSCACHE_ENABLED': False,
            'SITEMAP_FOLLOW': True,
            'SITEMAP_ALTERNATIVE_LINKS': True,
            'ROBOTS_TXT': False,
            'ROBOTSTXT_OBEY': False,
            'PRIORITY': 1,
            'MAX_URLS': 10000,
        }
        spidey_senses.update(get_spidey_senses(self))
        self.spider_settings = spidey_senses
        logger.info(f'{spider} senses defined')

        # Add User Agent to Crawl Info.
        # The updater is called directly (the previous ``execute_update``
        # wrapper was never imported and raised NameError).
        # NOTE(review): assumes update_crawl_user_agent returns a truthy
        # value on success, like update_crawl_complete below -- confirm.
        if update_crawl_user_agent(user_agent_id, crawl_uuid):
            logger.info(f'Crawl {crawl_uuid} User Agent Recorded')
        else:
            logger.error(f'Crawl {crawl_uuid} Failed to Record')

        # Set the starting URLs to crawl
        self.sitemap_urls = [start_url]
        logger.info(f'Starting URL set to {start_url}')

        # Roll to Parser
        for url in self.sitemap_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

        # NOTE(review): this point is reached when the generator is
        # exhausted, which is not necessarily after every response has been
        # parsed -- the counters below reflect whatever parse() has recorded
        # so far. Confirm against how the caller drives this generator.
        logger.info(f'{self.total_urls} crawled, {self.new_urls} new URLs found, {self.updated_urls} URLs updated')

        # Sitemap Complete
        if update_crawl_complete(crawl_uuid, self.new_urls, self.updated_urls):
            logger.info(f'Crawl {crawl_uuid} successfully recorded')
        else:
            logger.error(f'Crawl {crawl_uuid} NOT RECORDED')

    def parse(self, response) -> None:
        """
        Parses the given response and records the crawled URL.

        Parameters
        ----------
        response : scrapy.Response
            The response to parse.

        Returns
        -------
        None
        """
        # Log Found URL
        logger.debug(f'Found URL: {response.url}')

        # Every parsed response counts towards the crawl total
        self.total_urls += 1

        # Record crawled URL; the originating request's URL is passed as the
        # third argument.
        action = record_crawled_url(response.url, self.crawl_uuid, response.request.url)

        # Update counters
        if action == 'add':
            self.new_urls += 1
        elif action == 'update':
            self.updated_urls += 1

    def cartocrawler_future(self) -> None:
        """
        Determines the next action for CartoCrawler.

        If there are more sitemaps to crawl, calls the setup_cartocrawler() function.
        Otherwise, logs that there are no more sitemaps to crawl.
        """
        # What should cartocrawler do next?
        if should_sitemap_continue():
            logger.info('More sitemaps to crawl, lets go again!!! ')
            setup_cartocrawler()
        else:
            logger.info('No more sitemaps to crawl. Calling it a day... ')