📦 EqualifyEverything / crawler

📄 harpoon.py · 113 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113import json
from logger.config import logger
from flask import jsonify
from utils.sorter import is_url_sitemap

def harpoon_handler(payload):
    target = payload.get('domain', None) or payload.get('url', None)
    if not target:
        error_message = "Bad Request!🐶 Check your TARGET and resend, or don't..."
        response = jsonify({'status': 'error', 'message': error_message})
        response.status_code = 400
        return response
    # Check if target is a sitemap or not
    if is_url_sitemap(url):
        # URL is a sitemap
        if


    else:
        # URL is NOT a sitemap



        return is_sitemap(url)
        if is_sitemap is TRUE:
            if



    # Check if target is a sitemap or not with utils/sorter

    # Perform the targeted crawl
    results = throw_harpoon(target)


    #






    # Create the response object
    response = {'status': 'success', 'results': results}

    # Return the response as a JSON string
    return jsonify(response)











def harpoon_crawl(payload):
    """
    Handles a harpoon crawl request.

    This function takes a JSON payload containing the "target" parameter, which should be a URL or domain to crawl,
    and the "database" and "output" parameters, which control whether the results should be written to the database and
    output in the response, respectively.

    Returns:
        str: A JSON string containing the response data.
    """
    try:
        # Parse the payload JSON and extract the target parameter
        data = json.loads(payload)
        target = data["target"]

        # Check if the target is a valid URL or domain
        # Your validation logic here

        # Log a message indicating that the crawl is starting
        logger.info("Good Request. Let's roll")

        # Perform the crawl and generate the response
        # Your crawl logic here

        # Return the response
        # Your response generation logic here
    except (ValueError, KeyError):
        # If the payload is invalid or the target parameter is missing, log an error

        logger.error("Bad Request!🐶 Check your TARGET and resend, or don't...")
        # Return an error response
        # Your error response generation logic here





"""
Harpoon
This is for targeted crawls.

This function makes use of the below input

type=harpoon
Body
    target: string is required contains url or domain
    database: bool - decides if should write results to database or not. defaults to true
    output: bool    - decides if should output results vis response to request or not. defaults to false

I want to first check if the body, formatted in JSON, has a target that is either a domain or url. If it does, log.info:"Good Request. Lets Roll" if it is not a good url or domain, log.error:"Bad Request! [emoji for bad dog] Target invalid. Try again, or don't



"""