๐Ÿ“ฆ EqualifyEverything / crawler

๐Ÿ“„ select.py ยท 93 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import psycopg2
from database.access import connection
from logger.config import logger
from database.access import connection
from config import SITEMAP_FREQUENCY
from datetime import datetime, timedelta
# Log Emoji: ๐Ÿ—„๏ธ๐Ÿ”


def execute_select(query, params=None, fetchone=True):
    """Open a database connection, run a SELECT, and return the result.

    Args:
        query: SQL string, optionally containing %s placeholders.
        params: Sequence of values bound to the placeholders (or None).
        fetchone: When True, return a single row tuple or None;
            otherwise return the full list of rows from fetchall().

    Returns:
        A row tuple, None, or a list of row tuples.
    """
    # Connect to the database
    conn = connection()
    conn.open()
    logger.debug("🗄️🔍 Database connection opened")

    try:
        # Create a cursor
        cur = conn.conn.cursor()
        try:
            # Execute the query
            cur.execute(query, params)
            logger.debug(f"🗄️🔍 Query parameters: {params}")

            # Fetch the results if requested. psycopg2's fetchone()
            # already returns None when there is no row, so no rowcount
            # guard is needed.
            if fetchone:
                result = cur.fetchone()
            else:
                result = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Always release the connection, even if execute/fetch raises —
        # the original version leaked it on any exception.
        conn.close()
        logger.debug("🗄️🔍 Cursor and connection closed")

    return result

def next_sitemap_url():
    """Select the next sitemap to crawl.

    Picks an active sitemap that has not been crawled yet (crawl_uuid
    empty or NULL), oldest first.

    Returns:
        (url, domain_id) on success, or (None, None) when no sitemap
        is available.
    """
    # NOTE: AND binds tighter than OR in SQL, so the original
    # "A OR B AND active=TRUE" matched inactive sitemaps with an empty
    # crawl_uuid. The OR is now parenthesized so active=TRUE always
    # applies.
    query = """
        SELECT url, domain_id
        FROM meta.sitemaps
        WHERE (meta.sitemaps.crawl_uuid = '' OR meta.sitemaps.crawl_uuid IS NULL) AND active=TRUE
        ORDER BY
          CASE WHEN meta.sitemaps.crawl_uuid = '' OR meta.sitemaps.crawl_uuid IS NULL
            THEN created_at
            ELSE (SELECT ended_at FROM events.crawls WHERE events.crawls.crawl_uuid = meta.sitemaps.crawl_uuid)
          END ASC
        LIMIT 1;
    """
    result = execute_select(query)
    if result:
        url, domain_id = result
        logger.info(f'🗄️🔍 Sitemap Selected for Domain {domain_id}: {url}')
        return url, domain_id
    else:
        logger.error('🗄️🔍 Unable to select sitemap')
        return None, None


def get_useragent():
    """Pick one active user agent at random.

    Returns:
        (user_agent, user_agent_id) on success, or (None, None) when
        no active user agent exists.
    """
    query = """
        SELECT user_agent, id as "user_agent_id"
        FROM meta.user_agents
        WHERE active=TRUE
        ORDER BY RANDOM()
        LIMIT 1;
    """
    row = execute_select(query)
    logger.debug(f'🗄️🔍 User Agent result from query: {row}')

    # Guard clause: nothing active to choose from.
    if not row:
        logger.error('🗄️🔍 No Useragent Returned')
        return None, None

    agent, agent_id = row
    logger.info(f'🗄️🔍 Useragent Selected: {agent}')
    return agent, agent_id


def select_pending_sitemap_count():
    """Count sitemaps that are due for a (re-)crawl.

    A sitemap is pending when it has never been crawled (crawl_uuid is
    NULL) or its last crawl ended more than SITEMAP_FREQUENCY hours ago.

    Returns:
        The single-column COUNT row from the query (a tuple like (n,)),
        or None if the query produced no row — unchanged from the
        original for caller compatibility.
    """
    query = """
        SELECT count(*)
        FROM meta.sitemaps
        LEFT JOIN events.crawls ON meta.sitemaps.crawl_uuid = events.crawls.crawl_uuid
        WHERE meta.sitemaps.crawl_uuid is NULL OR events.crawls.ended_at < %s;
    """
    # Cut-off: anything last crawled before this moment is due again.
    frequency = timedelta(hours=SITEMAP_FREQUENCY)
    past_time = datetime.utcnow() - frequency
    logger.debug(f'🗄️🔍 Executing query: {query}')
    logger.debug(f'🗄️🔍 Query parameters: {past_time}')
    result = execute_select(query, (past_time,))
    # result is a one-element row tuple; log the bare count rather than
    # the tuple repr "(n,)" the original emitted.
    count = result[0] if result else 0
    logger.info(f'{count} Sitemaps to Crawl')
    return result