📦 EqualifyEverything / database

📄 Script-1.sql · 399 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
-- 
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
  NEW.updated_at = NOW();
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Add trigger to each table
CREATE TRIGGER update_updated_at
BEFORE UPDATE ON staging.urls
FOR EACH ROW
EXECUTE FUNCTION update_updated_at();



-- a11yPython User
GRANT SELECT ON meta.domains TO a11yPython;
GRANT INSERT ON staging.urls TO a11yPython;


SELECT "domain" 
FROM meta.domains
WHERE crawl = TRUE;


SELECT count(DISTINCT(url)),
count (url )
FROM staging.urls u ;


-- Delete duplicate urls from staging.urls
DELETE FROM staging.urls
WHERE url IN (
    SELECT url FROM (
        SELECT url,
        ROW_NUMBER() OVER (
            PARTITION BY url
            ORDER BY id DESC
        ) AS row_num
        FROM staging.urls
    ) t
    WHERE t.row_num > 1
);



-- FUNCTIONS

-- Update Last Crawl At in Domains
CREATE OR REPLACE FUNCTION update_last_crawl_at() RETURNS trigger AS $$
BEGIN
    UPDATE meta.domains
    SET last_crawl_at = NEW.created_at
    WHERE domain = NEW.domain;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- delete offsite urls
CREATE TRIGGER offsite_links_trigger
AFTER INSERT ON staging.offsite_links
FOR EACH ROW
EXECUTE FUNCTION delete_from_staging_urls();







-- process
CREATE OR REPLACE FUNCTION process_staging_urls() RETURNS void AS $$
	DECLARE
	  record RECORD;
	  domain_var TEXT;
	BEGIN
	  -- Loop through each row in the staging.urls table
	  FOR record IN SELECT * FROM staging.urls
	  LOOP
	    -- Get the domain associated with the current row's python_uuid
	    SELECT domain INTO STRICT domain_var FROM results.crawls WHERE python_uuid = record.python_uuid;
	
	    -- Check if the url is an offsite link
	    IF NOT (record.url ~ ('^https?://' || domain_var) OR record.url ~ '^https?://localhost') THEN
	      -- If it is an offsite link, insert it into the staging.offsite_links table
	      INSERT INTO staging.offsite_links (url, source_url, source_domain, python_uuid)
	      VALUES (record.url, record.source_url, domain_var, record.python_uuid);
	    END IF;
	  END LOOP;
	END;
$$ LANGUAGE plpgsql;


SELECT process_staging_urls();



--- Delete offsite from process_staging_urls 
CREATE OR REPLACE FUNCTION delete_from_staging_urls() RETURNS TRIGGER AS $$
BEGIN
    DELETE FROM staging.urls WHERE source_url = NEW.source_url AND url = NEW.url;
    RETURN NULL;
END;
$$ LANGUAGE plpgsql;







-- Triggers
CREATE TRIGGER update_last_crawl_at_trigger
AFTER INSERT ON results.offsite_links 
FOR EACH ROW
EXECUTE FUNCTION update_last_crawl_at();


-- Process Offsite_urls from staging.urls u 
SELECT process_staging_urls();


-- Get Total Count of URLs
SELECT count(*)
FROM staging.urls;






-- Get Results by Domain
SELECT crawls.domain AS "Domain",
    COUNT(urls.id) AS "Total",
    COUNT(CASE WHEN urls.updated_at >= urls.created_at 
        AND urls.updated_at > NOW() - INTERVAL '5 minutes' 
            THEN urls.id ELSE NULL END) AS "Recent Count"
FROM results.crawls
LEFT JOIN staging.urls ON crawls.python_uuid = urls.python_uuid
GROUP BY crawls.DOMAIN
ORDER BY "Recent Count" DESC;








-- Get Document Types
SELECT 
    CASE 
        WHEN url LIKE '%.pdf' THEN 'pdf'
        WHEN url LIKE '%.doc' THEN 'doc'
        WHEN url LIKE '%.docx' THEN 'docx'
        WHEN url LIKE '%.ppt' THEN 'ppt'
        WHEN url LIKE '%.pptx' THEN 'pptx'
        WHEN url LIKE '%.xls' THEN 'xls'
        WHEN url LIKE '%.xlsx' THEN 'xlsx'
        WHEN url LIKE '%.csv' THEN 'csv'
        WHEN url LIKE '%.zip' THEN 'zip'
        WHEN url LIKE '%.pages' THEN 'pages'
    END AS "file type",
    COUNT(*) AS "total count",
    COUNT(CASE WHEN created_at > NOW() - INTERVAL '5 minutes' THEN 1 ELSE NULL END) AS "5 minute count"
FROM staging.doc_urls
WHERE url LIKE '%.pdf' 
    OR url LIKE '%.doc' 
    OR url LIKE '%.docx' 
    OR url LIKE '%.ppt' 
    OR url LIKE '%.pptx' 
    OR url LIKE '%.xls' 
    OR url LIKE '%.xlsx' 
    OR url LIKE '%.csv' 
    OR url LIKE '%.zip' 
    OR url LIKE '%.pages'
GROUP BY "file type";


SELECT domain, last_crawl_at FROM meta.domains WHERE crawl = TRUE AND active = TRUE ORDER BY last_crawl_at ASC NULLS FIRST LIMIT 1;







SELECT * FROM results.crawls c 
WHERE python_uuid = '9c31d7f5-2893-42f2-b925-ccac12ff4f99';




-- Recently Added Rows to Sub-Tables
SELECT
    (SELECT COUNT(*) FROM staging.bad_urls WHERE created_at >= NOW() - INTERVAL '10 minutes') AS bad_urls_count,
    (SELECT COUNT(*) FROM staging.doc_urls WHERE created_at >= NOW() - INTERVAL '10 minutes') AS doc_urls_count,
    (SELECT COUNT(*) FROM staging.image_urls WHERE created_at >= NOW() - INTERVAL '10 minutes') AS image_urls_count,
    (SELECT COUNT(*) FROM staging.urls WHERE created_at >= NOW() - INTERVAL '10 minutes') AS urls_count;
   
   -- Get Results by Domain
SELECT crawls.domain AS "Domain",
    COUNT(urls.id) AS "Total",
    COUNT(CASE WHEN urls.updated_at >= urls.created_at 
        AND urls.updated_at > NOW() - INTERVAL '5 minutes' 
            THEN urls.id ELSE NULL END) AS "Recent Count"
FROM results.crawls
LEFT JOIN staging.urls ON crawls.python_uuid = urls.python_uuid
GROUP BY crawls.DOMAIN
ORDER BY "Recent Count" DESC;
   
   
   -- Recently Added Rows to Sub-Tables
SELECT
    (SELECT COUNT(*) FROM staging.bad_urls) AS bad_urls_count,
    (SELECT COUNT(*) FROM staging.doc_urls) AS doc_urls_count,
    (SELECT COUNT(*) FROM staging.image_urls) AS image_urls_count,
    (SELECT COUNT(*) FROM staging.urls) AS urls_count;
   
   SELECT url, id as "url_id" FROM staging.urls WHERE active=true AND NOT scanned_by_axe ORDER BY created_at DESC LIMIT 1;
  
  ALTER TABLE results.nodes
ALTER COLUMN target TYPE VARCHAR(255);

ALTER TABLE results.nodes ADD COLUMN data JSON;


  
  


   
   

   SELECT DISTINCT count(url)
   FROM staging.bad_urls
   WHERE python_uuid = 'f35da7b1-270e-41bc-a12e-a10c6c9c43e4';

   SELECT url
   -- count(*)
   FROM staging.image_urls iu
   WHERE created_at >= NOW() - INTERVAL '5 minutes';
  


SELECT 'crawls' as table_name, count(*) as total_rows FROM results.crawls
UNION ALL
SELECT 'items' as table_name, count(*) as total_rows FROM results.items
UNION ALL
SELECT 'nodes' as table_name, count(*) as total_rows FROM results.nodes
UNION ALL
SELECT 'scans' as table_name, count(*) as total_rows FROM results.scans
UNION ALL
SELECT 'subnodes' as table_name, count(*) as total_rows FROM results.subnodes
UNION ALL
SELECT 'urls' as table_name, count(*) as total_rows FROM staging.urls;















SELECT domain, last_crawl_at
FROM meta.domains
WHERE crawl = TRUE AND active = TRUE
ORDER BY last_crawl_at ASC NULLS FIRST
LIMIT 1;

SELECT domain, last_crawl_at FROM meta.domains WHERE crawl = TRUE AND active = TRUE ORDER BY last_crawl_at ASC NULLS FIRST LIMIT 1;

   
   
   
   
   
   
   
   
   -- Scratch
   


-- Testing Domain Selection
SELECT "domain"
FROM meta.domains
WHERE crawl = TRUE 
ORDER BY last_crawl_at ASC 
LIMIT 1;



-- Select counts from delete_from_staging_url
SELECT COUNT(*) 
FROM staging.urls u 
WHERE created_at >= NOW() - INTERVAL '30 minutes';









SELECT * FROM staging.urls
WHERE url LIKE 'https://cms.gov/CCIIO/Programs-and-Initiatives/Premium-Stabiliza%';






-- Fix Staging.URLs

		-- Copy doc urls to doc_urls
			INSERT INTO staging.doc_urls (url, source_url, python_uuid)
	    SELECT url, source_url, python_uuid
	    FROM staging.urls
		  WHERE url LIKE '%.pdf'
		   OR url LIKE '%.doc'
		   OR url LIKE '%.docx'
		   OR url LIKE '%.ppt'
		   OR url LIKE '%.pptx'
		   OR url LIKE '%.xls'
		   OR url LIKE '%.xlsx'
		   OR url LIKE '%.xml'
		   OR url LIKE '%.csv'
		   OR url LIKE '%.zip'
		   OR url LIKE '%.pages'
	    ON CONFLICT (url) DO UPDATE SET
	        source_url = EXCLUDED.source_url,
	        python_uuid = EXCLUDED.python_uuid;

	
						-- Delete docs from urls
						DELETE FROM staging.urls 
						WHERE url LIKE '%.pdf' OR url LIKE '%.doc' OR url LIKE '%.docx' OR 
						      url LIKE '%.ppt' OR url LIKE '%.pptx' OR url LIKE '%.xls' OR 
						      url LIKE '%.xlsx' OR url LIKE '%.xml' OR url LIKE '%.csv' OR 
						      url LIKE '%.zip' OR url LIKE '%.pages';
	
						     
						     
						     
		-- Copy Image URLs to image_urls
		INSERT INTO staging.image_urls (url, source_url, python_uuid)
		SELECT url, source_url, python_uuid
		FROM staging.urls
		WHERE url LIKE '%.jpg'
		   OR url LIKE '%.jpeg'
		   OR url LIKE '%.png'
		   OR url LIKE '%.gif'
		   OR url LIKE '%.svg'
		   OR url LIKE '%.bmp'
		ON CONFLICT (url) DO UPDATE SET
		    source_url = EXCLUDED.source_url,
		    python_uuid = EXCLUDED.python_uuid;
		  
						  	-- Delete images from urls
						  	DELETE FROM staging.urls
							WHERE url LIKE '%.jpg'
							   OR url LIKE '%.jpeg'
							   OR url LIKE '%.png'
							   OR url LIKE '%.gif'
							   OR url LIKE '%.svg'
							   OR url LIKE '%.bmp';
		  
		  
		  
SELECT count(*)
	    FROM staging.urls
		  WHERE url LIKE '%.pdf'
		   OR url LIKE '%.doc'
		   OR url LIKE '%.docx'
		   OR url LIKE '%.ppt'
		   OR url LIKE '%.pptx'
		   OR url LIKE '%.xls'
		   OR url LIKE '%.xlsx'
		   OR url LIKE '%.xml'
		   OR url LIKE '%.csv'
		   OR url LIKE '%.zip'
		   OR url LIKE '%.pages';