import psycopg2
import uuid
import re
import time
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
# --- PostgreSQL connection settings ---
db_host = "localhost"
db_port = 8432
db_user = "a11yPython"
db_password = "SnakeInTheWeb"
db_name = "a11y"

# Open one module-level connection/cursor shared by the spider below.
conn = psycopg2.connect(
    host=db_host,
    port=db_port,
    user=db_user,
    password=db_password,
    database=db_name,
)
cur = conn.cursor()

# Fetch the domains flagged for crawling; each row is a 1-tuple (domain,).
cur.execute("SELECT domain FROM meta.domains WHERE crawl = TRUE")
domains = cur.fetchall()
# Create the Spider
class A11ySpider(Spider):
    """Crawl every domain fetched from meta.domains and stage discovered URLs.

    Each crawl run is tagged with a UUID so rows in results.crawls and
    staging.urls can be correlated afterwards.
    """
    name = "A11ySpider"

    def start_requests(self):
        """Yield one seed request per domain, registering the crawl run first."""
        for domain in domains:
            python_uuid = uuid.uuid4()
            cur.execute(
                "INSERT INTO results.crawls (domain, python_uuid) VALUES (%s, %s)",
                (domain[0], str(python_uuid)),
            )
            # BUG FIX: commit here. Previously this insert was only persisted
            # when parse() committed, so a domain whose first request failed
            # (or whose page had no links) silently lost its crawls row.
            conn.commit()
            url = "https://" + domain[0]
            # time.sleep(10)  # Enable to add a 10 second delay between domains
            yield Request(
                url,
                callback=self.parse,
                meta={"domain": domain[0], "python_uuid": python_uuid},
            )

    def parse(self, response):
        """Record every link found on the page and follow the on-site ones."""
        domain = response.meta["domain"]
        python_uuid = response.meta["python_uuid"]
        source_url = response.url
        try:
            links = response.css("a::attr(href)").getall()
        except Exception as e:
            self.logger.warning(f"Failed to parse {response.url} with error: {e}")
            return
        to_follow = []
        for link in links:
            # BUG FIX: skip empty hrefs (and hrefs that reduce to "" below);
            # they previously got inserted as bogus "https://<domain>" rows.
            if not link:
                continue
            # Drop URL fragments and trailing slashes so URL variants dedupe.
            link = re.sub(r"#.*$", "", link).rstrip("/")
            # Skip tel:/mailto: links and hrefs containing a literal space.
            if not link or re.search(r'tel:|mailto:| ', link):
                continue
            if not link.startswith("http"):
                # BUG FIX: resolve relative hrefs against the current page.
                # The old  "https://" + domain + link  mangled document-relative
                # paths such as "page.html" into "https://domainpage.html".
                link = response.urljoin(link)
            cur.execute("""
INSERT INTO staging.urls (url, python_uuid, source_url)
VALUES (%s, %s, %s)
ON CONFLICT (url) DO NOTHING
""", (link, str(python_uuid), source_url))
            # Only pages on the same domain are crawled further.
            if link.startswith(f"https://{domain}"):
                to_follow.append(link)
        # One commit per page instead of one per link (was inside the loop).
        conn.commit()
        for link in to_follow:
            # time.sleep(1)  # wait 1 second before crawling the next page
            yield Request(
                link,
                callback=self.parse,
                meta={"domain": domain, "python_uuid": python_uuid, "source_url": source_url},
            )
# Crawler configuration.
# More info: https://docs.scrapy.org/en/latest/topics/settings.html
crawler_settings = {
    "BOT_NAME": "A11yCheck Bot",              # Name of Bot
    "DOWNLOAD_DELAY": 1,                      # Seconds to delay between requests
    "RANDOMIZE_DOWNLOAD_DELAY": True,         # Randomize DOWNLOAD_DELAY between 0.5 & 1.5x
    "COOKIES_ENABLED": False,                 # Disable cookies
    "CONCURRENT_REQUESTS": 50,                # Maximum concurrent requests
    "HTTPCACHE_ENABLED": False,               # Disable caching
    "CONCURRENT_REQUESTS_PER_DOMAIN": 16,     # Maximum concurrent requests per domain
    "ROBOTSTXT_OBEY": True,                   # Obey robots.txt rules
    "AUTOTHROTTLE_ENABLED": True,             # Enable AutoThrottle extension
    "AUTOTHROTTLE_START_DELAY": 5,            # Initial delay before AutoThrottle adjusts
    "AUTOTHROTTLE_TARGET_CONCURRENCY": 3,     # Target concurrency for AutoThrottle
    "LOG_LEVEL": "WARNING",                   # Logging level
    "LOG_ENABLED": True,                      # Enable logging
}

# Launch the crawl with the settings above; start() blocks until finished.
process = CrawlerProcess(settings=crawler_settings)
process.crawl(A11ySpider)
process.start()