📦 EqualifyEverything / crawler

📄 kraken.py · 68 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
# Scrapy Imports
import scrapy
from scrapy.utils.project import get_project_settings
# Database Operations
from database.select import next_sitemap_url
from database.insert import record_new_crawl
# Utilities
from utils.make import new_uuid
from utils.check import is_url_sitemap
from logger.config import logger
from crawls.spiders.cartocrawler import start_cartocrawler
from scrapy.crawler import CrawlerProcess



def setup_kraken_cartocrawler():
    """Record a new Kraken crawl and hand its sitemap to CartoCrawler.

    Takes the next ``(sitemap, domain_id)`` pair from ``next_sitemap_url``,
    calls ``record_new_crawl`` with a freshly generated UUID, and, when the
    URL passes ``is_url_sitemap``, calls ``start_cartocrawler`` with the
    sitemap URL and that UUID.

    Returns:
        bool: ``True`` when ``start_cartocrawler`` was called; ``False`` when
        the crawl could not be recorded or the URL is not a sitemap.
    """
    # NOTE(review): assumes next_sitemap_url() always returns a 2-item
    # sequence; confirm what it returns when no sitemap is left.
    sitemap, domain_id = next_sitemap_url()
    crawl_uuid = new_uuid()
    logger.debug(f'New UUID Generated: {crawl_uuid} ')

    # Fixed values passed to record_new_crawl for every crawl started here.
    actor_id = 2
    started_by = 'request'
    crawl_type = 'kraken'
    crawl_create_status = record_new_crawl(
        actor_id, started_by, crawl_uuid, crawl_type, domain_id, sitemap
    )

    # Crawl Create Error
    if not crawl_create_status:
        logger.error('๐Ÿฆ‘ Cartocrawler unable to launch. Check with the Kraken')
        return False

    # Crawl Created Successfully. The crawl is recorded before the sitemap
    # is validated, so a rejected URL still leaves a crawl record behind.
    logger.info('๐Ÿฆ‘ New Crawl Created, Checking Sitemap')
    if not is_url_sitemap(sitemap):
        logger.info(f'{sitemap} is not a sitemap. Set up a loop?')
        # Explicit falsy result instead of falling off the end with None.
        return False

    logger.info('๐Ÿฆ‘ Sitemap is good. Letting CartoCrawler know...')
    start_cartocrawler(sitemap, crawl_uuid)
    return True

def kraken_whats_next(crawl_type):
    """Decide the follow-up action after a cartocrawler run.

    Args:
        crawl_type: Name of the crawl that just finished. ``'kraken'``
            starts another sitemap crawl while ``should_sitemap_continue()``
            is truthy; ``'harpoon'`` only logs a warning; any other value
            logs an error.

    Returns:
        None. The function only logs and, for ``'kraken'``, may call
        ``setup_kraken_cartocrawler``.
    """
    spider = 'cartocrawler'
    logger.info(f"Determining whats next for the {crawl_type}'s {spider}... ")

    if crawl_type == 'kraken':
        # What should the Kraken do
        logger.info(f"This looks to be a {crawl_type}'s {spider} ")
        # NOTE(review): should_sitemap_continue is neither imported nor
        # defined in the visible part of this file; confirm where it lives,
        # otherwise this branch raises NameError.
        if should_sitemap_continue():
            # The f prefix was missing here, so the placeholder was logged
            # literally instead of the crawl type.
            logger.info(f'More sitemaps for the {crawl_type}, lets go again!!! ')
            setup_kraken_cartocrawler()
        else:
            logger.info('No more sitemaps to crawl. Calling it a day... ')
    elif crawl_type == 'harpoon':
        logger.warning('Harpoon asking for help! See manager... ')
    else:
        # What to do if crawl_type doesn't match
        logger.error("I'm lost. Help me! See my manager... ")


def setup_kraken_spinnocracy():
    """Stub entry point: logs a single error message and does nothing else."""
    message = "๐Ÿฆ‘ Help me Jim! I'm lost in the Kraken"
    logger.error(message)