import os
import importlib
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import SitemapSpider
# Database Operations
from database.insert import record_crawled_url
from database.update import update_crawl_status, update_crawl_user_agent, update_crawl_complete
# Utils & Configs
from utils.make import get_headers, get_spidey_senses
from logger.config import logger
from scrapy.utils.project import get_project_settings
# Log Emoji: 🕸️🕷️🗺️
class CartoCrawler(SitemapSpider):
    name = "carto"

    def __init__(self, *args, start_url=None, crawl_uuid=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_url = start_url
        self.crawl_uuid = crawl_uuid
        # SitemapSpider discovers pages from self.sitemap_urls; assumes start_url points at a sitemap
        self.sitemap_urls = [start_url] if start_url else []
        # Counters reported to update_crawl_complete when the spider closes
        self.new_urls = 0
        self.updated_urls = 0
        logger.debug('🕸️🕷️🗺️ Initializing CartoCrawler instance')
        # Get Headers
        spider = 'cartocrawler'
        headers, user_agent_id = get_headers(spider)
        logger.debug(f'🕸️🕷️🗺️ {spider} headers set')
        # Define spider-specific settings
        spidey_senses = {
            'CONCURRENT_REQUESTS': 8,
            'DNSCACHE_ENABLED': False,
            'SITEMAP_FOLLOW': True,
            'SITEMAP_ALTERNATIVE_LINKS': True,
            'ROBOTS_TXT': False,
            'ROBOTSTXT_OBEY': False,
            'PRIORITY': 1,
            'MAX_URLS': 10000,
        }
        self.spider_settings = spidey_senses
        logger.debug(f'🕸️🕷️🗺️ {spider} senses defined')
        # Add User Agent to Crawl Info
        if update_crawl_user_agent(user_agent_id, crawl_uuid):
            logger.debug(f'🕸️🕷️🗺️ Crawl {crawl_uuid} User Agent Recorded')
        else:
            logger.error(f'🕸️🕷️🗺️ Crawl {crawl_uuid} User Agent Failed to Record')

    def parse(self, response):
        # Log Found URL
        logger.debug(f'🕸️🕷️🗺️ Found URL: {response.url}')
        # Set variables for recording the crawled URL
        url = response.url
        crawl_uuid = self.crawl_uuid
        # Scrapy stores header values as bytes; decode the Referer before recording it
        source_url = response.request.headers.get('Referer', None)
        if source_url is not None:
            source_url = source_url.decode('utf-8')
        # Record crawled URL
        record_crawled_url(url, crawl_uuid, source_url)
        logger.debug(f'🕸️🕷️🗺️ URL {response.url} added to crawled urls')

    def closed(self, reason):
        # Sitemap crawl complete; record the outcome
        # Note: the base Spider class defines no closed() method, so no super() call is made here
        if update_crawl_complete(self.crawl_uuid, self.new_urls, self.updated_urls):
            logger.debug(f'🕸️🕷️🗺️ Crawl {self.crawl_uuid} successfully recorded')
        else:
            logger.error(f'🕸️🕷️🗺️ Crawl {self.crawl_uuid} NOT RECORDED')
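

# The module imports CrawlerRunner and get_project_settings but does not show how the spider
# is launched. The helper below is a minimal, hedged sketch of one way to kick off a crawl;
# run_carto_crawl, its arguments, and the example values are illustrative assumptions, not
# part of the existing codebase.
def run_carto_crawl(start_url, crawl_uuid):
    """Illustrative runner: crawl a single sitemap URL and stop the reactor when done."""
    from twisted.internet import reactor
    runner = CrawlerRunner(get_project_settings())
    # Pass the same keyword arguments CartoCrawler.__init__ expects
    deferred = runner.crawl(CartoCrawler, start_url=start_url, crawl_uuid=crawl_uuid)
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()


if __name__ == "__main__":
    # Placeholder values for a local test run
    run_carto_crawl("https://example.com/sitemap.xml", "00000000-0000-0000-0000-000000000000")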