๐Ÿ“ฆ EqualifyEverything / crawler

๐Ÿ“„ insert.py ยท 83 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import psycopg2
from database.access import connection
from logger.config import logger

def execute_insert(query, params=None, fetchone=True):
    """Execute a write query (INSERT/UPDATE) and optionally fetch results.

    Args:
        query: SQL statement, with %s placeholders for parameters.
        params: Optional sequence/tuple bound to the placeholders.
        fetchone: When True, return a single row (or None if the statement
            produced no rows); when False, return all rows via fetchall().

    Returns:
        A row tuple, a list of rows, or None on error / no rows.
        Errors are logged and swallowed deliberately (best-effort insert);
        callers must handle a None result.
    """
    logger.debug(f"🗄️✏️ Query parameters: {params}")

    # Connect to the database
    conn = connection()
    conn.open()
    logger.debug("🗄️✏️ Database connection opened")

    result = None
    try:
        # Create a cursor
        cur = conn.conn.cursor()
        try:
            # Execute the query and commit immediately
            cur.execute(query, params)
            conn.conn.commit()
            logger.info("🗄️✏️ Query executed and committed")

            # Fetch the results if requested
            if fetchone:
                # rowcount guard: a plain INSERT (no RETURNING) yields no
                # result set, and fetchone() would raise in that case.
                result = cur.fetchone() if cur.rowcount > 0 else None
            else:
                result = cur.fetchall()
                logger.debug(f'🗄️✏️ Fetched results: {result}')
        except Exception as e:
            logger.error(f"🗄️✏️ Error executing insert query: {e}")
            result = None
        finally:
            cur.close()
    finally:
        # Always release the connection, even on unexpected failures
        # (the original closed it unconditionally but outside any finally,
        # so an error while creating the cursor would have leaked it).
        conn.close()
        logger.debug("🗄️✏️ Cursor and connection closed")

    return result


def record_new_crawl(actor_id, started_by, crawl_uuid, crawl_type, domain_id, start_url):
    """Insert a new crawl row into events.crawls with status 'new'.

    Args:
        actor_id: ID of the actor initiating the crawl.
        started_by: Identifier of who/what started the crawl.
        crawl_uuid: Unique identifier for this crawl.
        crawl_type: Type/category of the crawl.
        domain_id: ID of the domain being crawled.
        start_url: URL where the crawl begins.

    Returns:
        True if the insert succeeded (RETURNING echoed our crawl_uuid),
        False otherwise.
    """
    logger.info(f"🗄️✏️ Recording new crawl: {crawl_uuid}")
    query = """
        INSERT INTO events.crawls (
            actor_id,
            status,
            started_by,
            crawl_uuid,
            crawl_type,
            domain_id,
            start_url
        )
        VALUES (
            %s, 'new', %s,
            %s, %s, %s, %s
        )
        RETURNING crawl_uuid
        """
    result = execute_insert(query, (actor_id, started_by, crawl_uuid, crawl_type, domain_id, start_url))
    logger.debug(f'🗄️✏️ Record New Crawl Output: {result} ')
    # execute_insert returns None when the insert failed — the original
    # indexed result[0] unconditionally, raising TypeError on failure
    # instead of returning False.
    if result is not None and result[0] == crawl_uuid:
        logger.debug(f'🗄️✏️ Crawl {crawl_uuid} created... ')
        return True
    else:
        logger.error(f'🗄️✏️ Record New Crawl had an issue with: {crawl_uuid} ')
        return False


def record_crawled_url(url, crawl_uuid, source_url):
    """Upsert a crawled URL into results.urls.

    On a URL conflict, only the last_crawled_at timestamp is refreshed.
    No value is returned; errors are logged inside execute_insert.
    """
    logger.info(f"🗄️✏️ Recording crawled URL: {url}")
    upsert_sql = """
        INSERT INTO results.urls (url,
            crawl_uuid,
            source_url,
            last_crawled_at)
        VALUES (%s, %s, %s, NOW())
        ON CONFLICT (url) DO UPDATE SET last_crawled_at = NOW();
    """
    logger.debug(f'🗄️ ✏️ URL To Record: {url} ')
    row_values = (url, crawl_uuid, source_url)
    execute_insert(upsert_sql, row_values)