import os
# Scrapy Imports
import scrapy
from scrapy.utils.project import get_project_settings
# Database Operations
from database.select import next_sitemap_url
from database.insert import record_new_crawl
# Utilities
from utils.make import new_uuid
from utils.check import is_url_sitemap
from logger.config import logger
from crawls.spiders.cartocrawler import start_cartocrawler
from scrapy.crawler import CrawlerProcess
# Log Emoji: ๐ธ๏ธ๐ฆ
def setup_kraken_cartocrawler():
    """Record a new Kraken crawl for the next sitemap and start CartoCrawler.

    Steps, in order:
      1. Fetch the next sitemap URL and its domain id via ``next_sitemap_url``.
      2. Generate a crawl UUID and record the crawl with ``record_new_crawl``.
      3. If ``is_url_sitemap`` accepts the URL, call ``start_cartocrawler``.

    Note that the crawl is recorded *before* the sitemap check, so a URL that
    fails the check still has a crawl record created for it.

    Returns:
        bool: ``True`` if CartoCrawler was started; ``False`` if the crawl
        could not be recorded or the URL is not a sitemap.
    """
    sitemap, domain_id = next_sitemap_url()
    crawl_uuid = new_uuid()
    logger.debug(f'๐ธ๏ธ๐ฆ New UUID Generated: {crawl_uuid} ')

    # Hard-coded attributes passed along with every crawl created here.
    actor_id = 2
    started_by = 'request'
    crawl_type = 'kraken'
    start_url = sitemap

    crawl_create_status = record_new_crawl(
        actor_id, started_by, crawl_uuid, crawl_type, domain_id, start_url
    )

    # Crawl Create Error
    if not crawl_create_status:
        logger.error('๐ธ๏ธ๐ฆ Cartocrawler unable to launch. Check with the Kraken')
        return False

    # Crawl Created Successfully
    logger.info('๐ธ๏ธ๐ฆ New Crawl Created, Checking Sitemap')
    if not is_url_sitemap(sitemap):
        logger.warning(f'๐ธ๏ธ๐ฆ {sitemap} is not a sitemap. Set up a loop?')
        # Return an explicit falsy result so every path yields a bool.
        return False

    logger.info(f'๐ธ๏ธ๐ฆ Sitemap is good. Letting CartoCrawler know...')
    start_cartocrawler(start_url, crawl_uuid)
    logger.info(f'๐ธ๏ธ๐ฆ CartoCrawler started for {start_url}')
    return True
def kraken_whats_next(crawl_type):
    """Decide the follow-up action after a cartocrawler run.

    Args:
        crawl_type: Label of the crawl that just ran. ``'kraken'`` may start
            another sitemap crawl, ``'harpoon'`` only logs a warning, and any
            other value logs an error.

    Returns:
        None. The result of ``setup_kraken_cartocrawler`` is discarded.
    """
    spider_name = 'cartocrawler'
    logger.info(f'๐ธ๏ธ๐ฆ Determining whats next for the {crawl_type}\'s {spider_name}... ')

    if crawl_type == 'kraken':
        # What should the Kraken do
        logger.info(f'๐ธ๏ธ๐ฆ This looks to be a {crawl_type}\'s {spider_name} ')
        # NOTE(review): should_sitemap_continue is not imported or defined in
        # the visible part of this module - confirm where it comes from.
        if not should_sitemap_continue():
            logger.info('๐ธ๏ธ๐ฆ No more sitemaps to crawl. Calling it a day... ')
            return
        logger.info('๐ธ๏ธ๐ฆ More sitemaps for the Kraken, lets go again!!! ')
        setup_kraken_cartocrawler()
        return

    if crawl_type == 'harpoon':
        logger.warning(f'๐ธ๏ธ๐ฆ Harpoon asking for help! See manager... ')
        return

    # What to do if crawl_type doesn't match
    logger.error(f'๐ธ๏ธ๐ฆ I\'m lost. Help me! See my manager... ')
def setup_kraken_spinnocracy():
    """Log a critical message that the Kraken Spinnocracy is down.

    Performs no setup work; it only emits the log record and returns None.
    """
    outage_message = '๐ธ๏ธ๐ฆ Oh no! The Kraken Spinnocracy is down...'
    logger.critical(outage_message)