๐Ÿ“ฆ EqualifyEverything / crawler

๐Ÿ“„ select.py ยท 156 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156import psycopg2
from database.access import connection
from logger.config import logger
from database.access import connection
from config import SITEMAP_FREQUENCY
from datetime import datetime, timedelta
# Log Emoji: ๐Ÿ—„๏ธ๐Ÿ”


def execute_select(query, params=None, fetchone=True):
    """Run a SELECT against the crawler database and return its result.

    Opens a fresh connection per call, executes `query` with `params`,
    and closes the cursor and connection even if execution raises
    (the original leaked both on an exception).

    Args:
        query: SQL text, with %s-style placeholders for psycopg2.
        params: Sequence/tuple of query parameters, or None.
        fetchone: When True return a single row tuple (or None when the
            query matched nothing); when False return a list of all rows.

    Returns:
        A row tuple, None, or a list of row tuples depending on `fetchone`.
    """
    # Connect to the database
    conn = connection()
    conn.open()
    logger.debug("🗄️🔍 Database connection opened")

    try:
        # Create a cursor
        cur = conn.conn.cursor()
        try:
            # Execute the query
            cur.execute(query, params)
            logger.debug(f"🗄️🔍 Query parameters: {params}")

            # Fetch the results if requested
            if fetchone:
                # rowcount guard keeps the original "no rows -> None" contract
                result = cur.fetchone() if cur.rowcount > 0 else None
            else:
                result = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Always release the connection, even on query failure
        conn.close()
        logger.debug("🗄️🔍 Cursor and connection closed")

    return result


def next_sitemap_url():
    """Pick the next sitemap to crawl.

    Preference order (decided in SQL via two CTEs):
      1. A random active sitemap that has never been crawled
         (recent_crawl_id IS NULL).
      2. Otherwise, a random active sitemap not updated in the last
         24 hours.

    Returns:
        (sitemap_url, sitemap_id, domain_id) on success, or
        (None, None, None) when no sitemap is eligible.  The original
        returned a 2-tuple on failure, which broke 3-way unpacking at
        call sites.
    """
    query = """
        WITH null_crawl AS (
            SELECT
                url AS "sitemap_url",
                id AS "sitemap_id",
                domain_id
            FROM targets.sitemaps
            WHERE recent_crawl_id IS NULL
            AND active_crawl is TRUE
            ORDER BY RANDOM()
            LIMIT 1
        ),
        random_row AS (
            SELECT
                url AS "sitemap_url",
                id AS "sitemap_id",
                domain_id
            FROM targets.sitemaps
            WHERE NOT EXISTS (SELECT 1 FROM null_crawl) AND
                  (current_timestamp - updated_at) > INTERVAL '24 hours'
                  AND active_crawl is TRUE
            ORDER BY RANDOM()
            LIMIT 1
        )
        SELECT * FROM null_crawl
        UNION ALL
        SELECT * FROM random_row
        LIMIT 1;
    """
    result = execute_select(query, ())
    if result:
        sitemap_url, sitemap_id, domain_id = result
        logger.info(f'🗄️🔍 Sitemap Selected for Domain {domain_id}: {sitemap_url}')
        return sitemap_url, sitemap_id, domain_id
    else:
        logger.error('🗄️🔍 Unable to select sitemap')
        # Arity matches the success path so callers can always unpack 3 values
        return None, None, None


def get_useragent():
    """Select one random active user agent from meta.user_agents.

    Returns:
        (user_agent, user_agent_id) on success, (None, None) when no
        active user agent exists.
    """
    query = """
            SELECT user_agent, id as "user_agent_id"
            FROM meta.user_agents
            WHERE active=TRUE
            ORDER BY RANDOM()
            LIMIT 1;
        """
    row = execute_select(query)
    logger.debug(f'🗄️🔍 User Agent result from query: {row}')

    # Guard clause: nothing matched, keep the (None, None) contract.
    if not row:
        logger.error('🗄️🔍 No Useragent Returned')
        return None, None

    agent, agent_id = row
    logger.info(f'🗄️🔍 Useragent Selected: {agent}')
    return agent, agent_id



def select_pending_sitemap_count():
    """Count sitemaps that still need crawling.

    A sitemap is pending when it has never been crawled (no crawl_uuid)
    or its last crawl ended more than SITEMAP_FREQUENCY hours ago.

    Returns:
        The raw fetchone result — a 1-tuple like (count,) — or None,
        preserving the original return contract for existing callers.
    """
    query = """
        SELECT count(*)
        FROM meta.sitemaps
        LEFT JOIN events.crawls ON meta.sitemaps.crawl_uuid = events.crawls.crawl_uuid
        WHERE meta.sitemaps.crawl_uuid is NULL OR events.crawls.ended_at < %s;
    """
    # calculate the datetime value SITEMAP_FREQUENCY hours ago
    # NOTE(review): utcnow() is naive; assumes ended_at is stored in UTC — confirm
    frequency = timedelta(hours=SITEMAP_FREQUENCY)
    past_time = datetime.utcnow() - frequency
    logger.debug(f'🗄️🔍 Executing query: {query}')
    logger.debug(f'🗄️🔍 Query parameters: {past_time}')
    result = execute_select(query, (past_time,))
    # Unpack for logging: the original logged the raw tuple, e.g. "(5,) Sitemaps to Crawl"
    count = result[0] if result else 0
    logger.info(f'{count} Sitemaps to Crawl')
    return result

def next_rosevelt_url():
    """Pick the next URL for the Rosevelt crawl.

    Preference order (decided in SQL via CTEs):
      1. A random URL that has never been crawled by Rosevelt
         (crawled_at_rosevelt IS NULL).
      2. Otherwise, a random pick from the 200 least-recently-crawled URLs.

    Returns:
        (url_id, url, domain_id) on success, or (None, None, None) when
        no URL is available.  The original returned a bare None on
        failure, breaking 3-way unpacking and diverging from
        next_sitemap_url's contract.
    """
    query = """
        WITH null_crawl AS (
            SELECT
                id AS "url_id",
                url,
                domain_id
            FROM targets.urls
            WHERE crawled_at_rosevelt IS NULL
            ORDER BY RANDOM()
            LIMIT 1
        ),
        oldest_crawl AS (
            SELECT
                id AS "url_id",
                url,
                domain_id
            FROM targets.urls
            WHERE NOT EXISTS (SELECT 1 FROM null_crawl)
            ORDER BY crawled_at_rosevelt ASC
            LIMIT 200
        ),
        random_oldest_row AS (
            SELECT *
            FROM oldest_crawl
            ORDER BY RANDOM()
            LIMIT 1
        )
        SELECT * FROM null_crawl
        UNION ALL
        SELECT * FROM random_oldest_row
        LIMIT 1;
    """
    result = execute_select(query, ())
    if result:
        url_id, url, domain_id = result
        logger.debug(f'🗄️🔍 URL Selected for Domain {domain_id}: {url}')
        return url_id, url, domain_id
    else:
        logger.error('🗄️🔍 Unable to select URL')
        # Arity matches the success path and next_sitemap_url's contract
        return None, None, None