๐Ÿ“ฆ EqualifyEverything / crawler

๐Ÿ“„ kraken.py ยท 75 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75import json
import subprocess
from logger.config import logger
from flask import jsonify
from database.select import next_sitemap_url
from database.update import update_crawl_status
from database.insert import record_new_crawl
from utils.make import new_uuid
from utils.check import is_url_sitemap
from crawls.spiders import start_cartocrawler

# This script takes the JSON body payload of 'spider' and executes the appropriate function.



def spider_finder(payload):
    """Dispatch the crawl request to the spider named in the payload.

    Args:
        payload: Parsed JSON body of the request. Expected to carry a
            'spider' key naming the crawler to launch
            ('cartocrawler' or 'spinocracy').

    Returns:
        A Flask JSON response: 200 with a status message when a known
        spider was launched, 400 with an error body otherwise.
    """
    # Get the spider parameter from the payload
    spider_type = payload.get('spider')

    if spider_type == 'cartocrawler':
        response_message = 'CartoCrawler has been started!'
        logger.info(' ๐Ÿ•ท๏ธ ๐Ÿš€ CartoCrawler')
        # The original called launch_cartocrawler(), which is not defined
        # in this module; setup_cartocrawler (below) is the function that
        # records and starts the crawl.
        setup_cartocrawler()

    # Spinocracy
    # INFO Search for all URLs on a domain, not just the sitemap.
    elif spider_type == 'spinocracy':
        response_message = 'Spinocracy has started!'
        logger.info(' ๐Ÿ•ท๏ธ ๐Ÿš€ Spinnocracy')
        # NOTE(review): launch_spinocracy is not defined or imported in
        # this file — confirm it exists elsewhere before shipping.
        launch_spinocracy()

    # No defined spider or other error
    else:
        # Return an error message if the kraken_type is invalid
        error_msg = {'error': 'Bad Spider. Check spider variable. '}
        logger.error(' ๐Ÿ•ท๏ธ ๐Ÿ’€ Spider selection failed. Check spider_type ')
        response = jsonify(error_msg)
        response.status_code = 400
        return response

    # Previously response_message was built but never returned, so the
    # caller (a Flask view) got None on success; return a real response.
    return jsonify({'message': response_message})


def setup_cartocrawler():
    """Create a crawl record for the next sitemap URL and start CartoCrawler.

    Pulls the next sitemap URL and its domain id from the database,
    records a new crawl, then — if the URL checks out as a sitemap —
    hands it off to the CartoCrawler spider. Failures are logged; the
    function returns None in every case.
    """
    sitemap, domain_id = next_sitemap_url()
    crawl_uuid = new_uuid()
    actor_id = 'cartocrawler'
    started_by = 'request'
    crawl_type = 'kraken'
    start_url = sitemap
    crawl_create_status = record_new_crawl(
        actor_id, started_by, crawl_uuid, crawl_type, domain_id, start_url)

    # Crawl Create Error. (The original if/elif/else chain tested the same
    # boolean twice, leaving the final else unreachable — removed here.)
    if not crawl_create_status:
        logger.error('Cartocrawler unable to launch. Check launch_cartocrawler')
        return

    # Crawl Created Successfully
    logger.info('New Crawl Created, Checking Sitemap')
    if is_url_sitemap(sitemap):
        start_cartocrawler(crawl_uuid, start_url)
        logger.info('Sitemap is good. Letting CartoCrawler know...')
    else:
        # NOTE(review): non-sitemap URLs are only logged; no retry loop yet.
        logger.info(f'{sitemap} is not a sitemap. Set up a loop?')