import psycopg2
import os
import re
import time
import uuid

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess

# PostgreSQL connection info.
# Values can be overridden via environment variables; the defaults preserve
# the previous hard-coded values so existing deployments keep working.
# NOTE(review): credentials were hard-coded in source — set A11Y_DB_PASSWORD
# (etc.) in the environment rather than committing secrets to the repo.
db_host = os.environ.get("A11Y_DB_HOST", "localhost")
db_port = int(os.environ.get("A11Y_DB_PORT", "8432"))
db_user = os.environ.get("A11Y_DB_USER", "a11yPython")
db_password = os.environ.get("A11Y_DB_PASSWORD", "SnakeInTheWeb")
db_name = os.environ.get("A11Y_DB_NAME", "a11y")

# Connect to database (module-level handles shared by the spider below)
conn = psycopg2.connect(host=db_host, port=db_port, user=db_user, password=db_password, database=db_name)
cur = conn.cursor()

# Get domains to scrape: rows of (domain,) for every domain flagged for crawling
cur.execute("SELECT domain FROM meta.domains WHERE crawl = TRUE")
domains = cur.fetchall()

# Create the Spider
# Spider that crawls each flagged domain and stages every discovered URL.
class A11ySpider(Spider):
    """Crawl every domain from meta.domains and record its links in Postgres.

    Relies on the module-level ``conn``/``cur`` psycopg2 handles and the
    pre-fetched ``domains`` rows. Each domain crawl is tagged with a UUID
    that is carried through every request via ``meta``.
    """

    name = "A11ySpider"

    def start_requests(self):
        """Seed one request per configured domain, recording a crawl UUID first."""
        for domain_row in domains:
            domain = domain_row[0]
            # One UUID per domain crawl; propagated to every page via meta.
            python_uuid = uuid.uuid4()

            # Record the crawl before issuing the first request.
            cur.execute(
                "INSERT INTO results.crawls (domain, python_uuid) VALUES (%s, %s)",
                (domain, str(python_uuid)),
            )
            # Commit immediately: previously this insert was only committed as
            # a side effect of the first successful parse(), so a crawl that
            # failed before parsing left the row uncommitted.
            conn.commit()

            # time.sleep(10)  # Enable to add a 10 second delay between domains
            yield Request(
                "https://" + domain,
                callback=self.parse,
                meta={"domain": domain, "python_uuid": python_uuid},
            )

    def parse(self, response):
        """Extract links from a page, stage them, and follow same-domain ones."""
        domain = response.meta["domain"]  # domain this crawl was seeded with
        python_uuid = response.meta["python_uuid"]  # crawl UUID
        source_url = response.url  # page the links were found on

        # Get all links from the page
        try:
            links = response.css("a::attr(href)").getall()
        except Exception as e:
            # Non-HTML responses (PDFs, images, ...) have no selector support.
            self.logger.warning(f"Failed to parse {response.url} with error: {e}")
            return

        for link in links:
            if link is None:
                continue
            # Strip fragments and trailing "/" so URLs dedupe cleanly.
            link = re.sub(r"#.*$", "", link).rstrip("/")
            # Skip tel:/mailto:/javascript: pseudo-links, hrefs containing
            # spaces, and hrefs that became empty after stripping (pure
            # fragment links) — the empty case previously re-inserted and
            # re-crawled the bare domain root.
            if not link or re.search(r"tel:|mailto:|javascript:| ", link):
                continue
            if link.startswith("//"):
                # Scheme-relative href: inherit https.
                link = "https:" + link
            elif not link.startswith("http"):
                # Resolve relative hrefs against the domain root. Previously
                # a path without a leading "/" was concatenated directly onto
                # the domain, producing a malformed URL.
                if not link.startswith("/"):
                    link = "/" + link
                link = f"https://{domain}{link}"

            # Stage the URL; duplicates are ignored via the unique constraint.
            cur.execute("""
                INSERT INTO staging.urls (url, python_uuid, source_url)
                VALUES (%s, %s, %s)
                ON CONFLICT (url) DO NOTHING
            """, (link, str(python_uuid), source_url))
            conn.commit()
            # self.logger.info(f"Inserted {link} into staging.urls")

            # Only follow links that stay on the domain being crawled.
            if link.startswith(f"https://{domain}"):
                # time.sleep(1)  # wait before crawling the next same-domain page
                yield Request(
                    link,
                    callback=self.parse,
                    meta={"domain": domain, "python_uuid": python_uuid, "source_url": source_url},
                )

# Crawler settings.
# More info: https://docs.scrapy.org/en/latest/topics/settings.html
crawler_settings = {
    # Identity / politeness
    "BOT_NAME": "A11yCheck Bot",            # Name of Bot
    "ROBOTSTXT_OBEY": True,                 # Obey robots.txt rules
    "DOWNLOAD_DELAY": 1,                    # Seconds to delay between requests
    "RANDOMIZE_DOWNLOAD_DELAY": True,       # Randomize DOWNLOAD_DELAY between 0.5 & 1.5x
    # Concurrency / throttling
    "CONCURRENT_REQUESTS": 50,              # Maximum concurrent requests
    "CONCURRENT_REQUESTS_PER_DOMAIN": 16,   # Maximum concurrent requests per domain
    "AUTOTHROTTLE_ENABLED": True,           # Enable AutoThrottle extension
    "AUTOTHROTTLE_START_DELAY": 5,          # Initial delay before AutoThrottle starts adjusting the delay
    "AUTOTHROTTLE_TARGET_CONCURRENCY": 3,   # Target concurrency for AutoThrottle
    # State / caching
    "COOKIES_ENABLED": False,               # Disable cookies
    "HTTPCACHE_ENABLED": False,             # Disable caching
    # Logging
    "LOG_ENABLED": True,                    # Enable logging
    "LOG_LEVEL": "WARNING",                 # Logging level
}

# Build the process, register the spider, and block until the crawl finishes.
process = CrawlerProcess(settings=crawler_settings)
process.crawl(A11ySpider)
process.start()