📦 EqualifyEverything / crawler

📄 kraken.py · 77 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
# Scrapy Imports
import scrapy
from scrapy.utils.project import get_project_settings
# Database Operations
from database.select import next_sitemap_url
from database.insert import record_new_crawl
# Utilities
from utils.make import new_uuid
from utils.check import is_url_sitemap, should_sitemap_continue
from logger.config import logger
# from crawls.spiders.cartocrawler import start_cartocrawler
from scrapy.crawler import CrawlerProcess
from crawls.spiders.cartocrawler import CartoCrawler

# Log Emoji: ๐Ÿ•ธ๏ธ๐Ÿฆ‘

def setup_kraken_cartocrawler():
    """Record a new kraken crawl and run CartoCrawler on the next sitemap.

    Fetches the next sitemap URL and its domain id, records a new crawl under
    a freshly generated UUID, and, if the URL passes ``is_url_sitemap``, runs
    ``CartoCrawler`` against it inside a ``CrawlerProcess``.

    Returns:
        bool: True after the crawler process has been run for the sitemap;
        False if the crawl record could not be created or the URL is not a
        sitemap.
    """
    sitemap, domain_id = next_sitemap_url()
    crawl_uuid = new_uuid()
    logger.debug(f'๐Ÿ•ธ๏ธ๐Ÿฆ‘ New UUID Generated: {crawl_uuid} ')

    # Fixed metadata describing who/what started this crawl.
    actor_id = 2
    started_by = 'request'
    crawl_type = 'kraken'
    start_url = sitemap
    crawl_create_status = record_new_crawl(actor_id, started_by, crawl_uuid, crawl_type, domain_id, start_url)

    # Crawl Create Error
    if not crawl_create_status:
        logger.error('๐Ÿ•ธ๏ธ๐Ÿฆ‘ Cartocrawler unable to launch. Check with the Kraken')
        return False

    # Crawl Created Successfully
    logger.info('๐Ÿ•ธ๏ธ๐Ÿฆ‘ New Crawl Created, Checking Sitemap')
    if not is_url_sitemap(sitemap):
        logger.warning(f'๐Ÿ•ธ๏ธ๐Ÿฆ‘ {sitemap} is not a sitemap. Set up a loop?')
        # Explicit False keeps the return type consistent across all paths.
        return False

    logger.info('๐Ÿ•ธ๏ธ๐Ÿฆ‘ Sitemap is good. Letting CartoCrawler know...')
    # NOTE(review): CrawlerProcess() is built without project settings even
    # though get_project_settings is imported by this module -- confirm that
    # default settings are intended.
    process = CrawlerProcess()
    process.crawl(CartoCrawler, start_url=start_url, crawl_uuid=crawl_uuid)
    # Per the Scrapy docs, start() blocks until crawling finishes by default,
    # so the log line below is emitted only after the crawl has run.
    # NOTE(review): a Twisted reactor cannot be restarted; calling this
    # function again in the same process (as kraken_whats_next does) will
    # presumably fail with ReactorNotRestartable -- confirm.
    process.start()

    logger.info(f'๐Ÿ•ธ๏ธ๐Ÿฆ‘ CartoCrawler started for {start_url}')
    return True



def kraken_whats_next(crawl_type):
    """Decide what to do after a crawl of the given type and act on it.

    Args:
        crawl_type: Crawl type name. 'kraken' triggers another sitemap crawl
            when ``should_sitemap_continue()`` is truthy; 'harpoon' only logs
            a warning; anything else logs an error.

    Returns:
        None. The function only logs and, for kraken crawls with more
        sitemaps pending, calls ``setup_kraken_cartocrawler()``.
    """
    spider_name = 'cartocrawler'
    logger.info(f'๐Ÿ•ธ๏ธ๐Ÿฆ‘ Determining whats next for the {crawl_type}\'s {spider_name}... ')

    # Handle the non-kraken cases first so the main path reads flat.
    if crawl_type == 'harpoon':
        logger.warning(f'๐Ÿ•ธ๏ธ๐Ÿฆ‘ Harpoon asking for help! See manager... ')
        return

    if crawl_type != 'kraken':
        # Unknown crawl type: nothing sensible to schedule.
        logger.error(f'๐Ÿ•ธ๏ธ๐Ÿฆ‘ I\'m lost. Help me! See my manager... ')
        return

    # What should the Kraken do
    logger.info(f'๐Ÿ•ธ๏ธ๐Ÿฆ‘ This looks to be a {crawl_type}\'s {spider_name} ')
    if not should_sitemap_continue():
        logger.info('๐Ÿ•ธ๏ธ๐Ÿฆ‘ No more sitemaps to crawl. Calling it a day... ')
        return

    logger.info('๐Ÿ•ธ๏ธ๐Ÿฆ‘ More sitemaps for the Kraken, lets go again!!! ')
    setup_kraken_cartocrawler()


def setup_kraken_spinnocracy():
    """Log a critical message that the Kraken Spinnocracy is down.

    The function performs no setup work; it only emits the log record and
    returns None.
    """
    outage_notice = '๐Ÿ•ธ๏ธ๐Ÿฆ‘ Oh no! The Kraken Spinnocracy is down...'
    logger.critical(outage_notice)