1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from datetime import datetime, timedelta

import psycopg2

from config import SITEMAP_FREQUENCY
from database.access import connection
from logger.config import logger
# Log Emoji: ๐๏ธ๐
def execute_select(query, params=None, fetchone=True):
    """Run a SELECT statement and return its result.

    Opens a fresh database connection per call, executes ``query`` with
    ``params``, and always closes the connection before returning.

    Args:
        query (str): SQL SELECT statement (may contain %s placeholders).
        params (tuple | None): Values bound to the placeholders.
        fetchone (bool): When True, return a single row (or None if the
            query matched nothing); otherwise return all rows.

    Returns:
        tuple | list | None: One row, a list of rows, or None.
    """
    conn = connection()
    conn.open()
    logger.debug("๐๏ธ๐ Database connection opened")
    try:
        # conn.conn is the underlying DB-API connection object.
        cur = conn.conn.cursor()
        cur.execute(query, params)
        logger.debug(f"๐๏ธ๐ Query parameters: {params}")
        if fetchone:
            # rowcount is populated after execute(), so an empty result
            # set yields an explicit None instead of calling fetchone().
            result = cur.fetchone() if cur.rowcount > 0 else None
        else:
            result = cur.fetchall()
        cur.close()
    finally:
        # Always release the connection — the original version leaked
        # it whenever execute() raised.
        conn.close()
        logger.debug("๐๏ธ๐ Cursor and connection closed")
    return result
def next_sitemap_url():
    """Select the next sitemap to crawl.

    Preference order: a random active sitemap that has never been
    crawled (recent_crawl_id IS NULL); otherwise a random active
    sitemap whose updated_at is more than 24 hours old.

    Returns:
        tuple: (sitemap_url, sitemap_id, domain_id), or
        (None, None, None) when no sitemap is eligible.
    """
    query = """
WITH null_crawl AS (
SELECT
url AS "sitemap_url",
id AS "sitemap_id",
domain_id
FROM targets.sitemaps
WHERE recent_crawl_id IS NULL
AND active_crawl is TRUE
ORDER BY RANDOM()
LIMIT 1
),
random_row AS (
SELECT
url AS "sitemap_url",
id AS "sitemap_id",
domain_id
FROM targets.sitemaps
WHERE NOT EXISTS (SELECT 1 FROM null_crawl) AND
(current_timestamp - updated_at) > INTERVAL '24 hours'
AND active_crawl is TRUE
ORDER BY RANDOM()
LIMIT 1
)
SELECT * FROM null_crawl
UNION ALL
SELECT * FROM random_row
LIMIT 1;
"""
    result = execute_select(query, ())
    if result:
        sitemap_url, sitemap_id, domain_id = result
        logger.info(f'๐๏ธ๐ Sitemap Selected for Domain {domain_id}: {sitemap_url}')
        return sitemap_url, sitemap_id, domain_id
    logger.error('๐๏ธ๐ Unable to select sitemap')
    # Bug fix: previously returned a 2-tuple (None, None) on failure,
    # which crashed callers unpacking three values.
    return None, None, None
def get_useragent():
    """Pick one active user agent at random.

    Returns:
        tuple: (user_agent, user_agent_id), or (None, None) when no
        active row exists in meta.user_agents.
    """
    query = """
SELECT user_agent, id as "user_agent_id"
FROM meta.user_agents
WHERE active=TRUE
ORDER BY RANDOM()
LIMIT 1;
"""
    row = execute_select(query)
    logger.debug(f'๐๏ธ๐ User Agent result from query: {row}')
    # Guard clause: bail out early when the table had no active rows.
    if not row:
        logger.error('๐๏ธ๐ No Useragent Returned')
        return None, None
    user_agent, user_agent_id = row
    logger.info(f'๐๏ธ๐ Useragent Selected: {user_agent}')
    return user_agent, user_agent_id
def select_pending_sitemap_count():
    """Count sitemaps that still need crawling.

    A sitemap is pending when it has never been crawled (crawl_uuid is
    NULL) or its most recent crawl ended more than SITEMAP_FREQUENCY
    hours ago.

    Returns:
        tuple | None: The single-row query result, e.g. ``(count,)``.
    """
    query = """
SELECT count(*)
FROM meta.sitemaps
LEFT JOIN events.crawls ON meta.sitemaps.crawl_uuid = events.crawls.crawl_uuid
WHERE meta.sitemaps.crawl_uuid is NULL OR events.crawls.ended_at < %s;
"""
    # Datetime SITEMAP_FREQUENCY hours in the past (naive UTC, matching
    # the rest of this module's timestamp handling).
    past_time = datetime.utcnow() - timedelta(hours=SITEMAP_FREQUENCY)
    logger.debug(f'๐๏ธ๐ Executing query: {query}')
    logger.debug(f'๐๏ธ๐ Query parameters: {past_time}')
    result = execute_select(query, (past_time,))
    # Bug fix: log the scalar count instead of the raw one-element
    # tuple ("(12,) Sitemaps to Crawl"). Return value is unchanged.
    count = result[0] if result else 0
    logger.info(f'{count} Sitemaps to Crawl')
    return result
def next_rosevelt_url():
    """Select the next URL for the Rosevelt crawler.

    Preference order: a random URL never crawled by Rosevelt
    (crawled_at_rosevelt IS NULL); otherwise a random pick from the
    200 least-recently-crawled URLs.

    Returns:
        tuple: (url_id, url, domain_id), or (None, None, None) when no
        URL could be selected.
    """
    query = """
WITH null_crawl AS (
SELECT
id AS "url_id",
url,
domain_id
FROM targets.urls
WHERE crawled_at_rosevelt IS NULL
ORDER BY RANDOM()
LIMIT 1
),
oldest_crawl AS (
SELECT
id AS "url_id",
url,
domain_id
FROM targets.urls
WHERE NOT EXISTS (SELECT 1 FROM null_crawl)
ORDER BY crawled_at_rosevelt ASC
LIMIT 200
),
random_oldest_row AS (
SELECT *
FROM oldest_crawl
ORDER BY RANDOM()
LIMIT 1
)
SELECT * FROM null_crawl
UNION ALL
SELECT * FROM random_oldest_row
LIMIT 1;
"""
    result = execute_select(query, ())
    if result:
        url_id, url, domain_id = result
        logger.debug(f'๐๏ธ๐ URL Selected for Domain {domain_id}: {url}')
        return url_id, url, domain_id
    logger.error('๐๏ธ๐ Unable to select URL')
    # Bug fix: previously returned a bare None on failure while the
    # success path returns a 3-tuple; callers unpacking three values
    # would raise TypeError. Now mirrors next_sitemap_url().
    return None, None, None