📦 EqualifyEverything / crawler

📄 kraken.py · 51 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import threading
from logger.config import logger
from database.select import next_sitemap_url
from utils.make import new_uuid
from database.insert import record_new_crawl
from utils.check import is_url_sitemap
from crawls.spiders.cartocrawler import CartoCrawler
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider_in_thread(start_url):
    """Run one CartoCrawler crawl for ``start_url`` in a new CrawlerProcess.

    Builds a CrawlerProcess from the project settings, schedules the
    CartoCrawler spider with ``start_url`` as a spider argument, and starts
    the process with ``stop_after_crawl=True``.

    NOTE(review): setup_kraken_cartocrawler uses this as a thread target.
    Twisted reactors are generally not restartable and Scrapy may install
    signal handlers on start -- confirm that repeated or off-main-thread
    crawls actually work in this deployment.
    """
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CartoCrawler, start_url=start_url)
    # stop_after_crawl is an argument of start(), not of crawl().
    crawler_process.start(stop_after_crawl=True)

def setup_kraken_cartocrawler():
    """Record a new 'kraken' crawl and, if possible, launch CartoCrawler.

    Takes a ``(sitemap, domain_id)`` pair from ``next_sitemap_url()``,
    generates a crawl UUID, and passes both to ``record_new_crawl``. When the
    record is created and ``is_url_sitemap(sitemap)`` is truthy, the spider is
    started on a separate thread via ``run_spider_in_thread``; the thread is
    not joined here.

    Returns:
        False when ``record_new_crawl`` reports failure (falsy status);
        None in every other case, whether or not the spider was started.
    """
    sitemap, domain_id = next_sitemap_url()
    crawl_uuid = new_uuid()
    logger.debug(f'New UUID Generated: {crawl_uuid} ')
    actor_id = 2
    started_by = 'request'
    crawl_type = 'kraken'
    start_url = sitemap
    crawl_create_status = record_new_crawl(
        actor_id, started_by, crawl_uuid, crawl_type, domain_id, start_url
    )

    # Crawl Create Error
    if not crawl_create_status:
        logger.error('๐Ÿฆ‘ Cartocrawler unable to launch. Check with the Kraken')
        return False

    # Crawl Created Successfully. The former trailing ``else`` branch could
    # never run (status is either falsy or truthy), so it has been removed.
    logger.info('๐Ÿฆ‘ New Crawl Created, Checking Sitemap')
    if not is_url_sitemap(sitemap):
        logger.info(f'{sitemap} is not a sitemap. Set up a loop?')
        return None

    logger.info('๐Ÿฆ‘ Sitemap is good. Letting CartoCrawler know...')
    t = threading.Thread(target=run_spider_in_thread, args=(start_url,))
    t.start()
    return None



def setup_kraken_spinnocracy():
    """Placeholder entry point: only logs an error message and returns None."""
    message = "๐Ÿฆ‘ Help me Jim! I'm lost in the Kraken!  "
    logger.error(message)