import logging
import os
import re
import time
import uuid

import psycopg2
from lxml import etree
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from twisted.internet.error import DNSLookupError, TCPTimedOutError
# Configure the shared crawl logger exactly once per process.
logger = logging.getLogger("A11y🪵 ")
if not logger.hasHandlers():
    # Capture everything at the logger; only surface INFO+ on the console.
    logger.setLevel(logging.DEBUG)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - [%(levelname)s] - %(message)s')
    )
    logger.addHandler(console)
# Postgres connection info — overridable via environment, with local defaults.
db_host = os.environ.get("DB_HOST", "localhost")
db_port = int(os.environ.get("DB_PORT", "8432"))
db_user = os.environ.get("DB_USER", "a11yPython")
db_password = os.environ.get("DB_PASSWORD", "SnakeInTheWeb")
db_name = os.environ.get("DB_NAME", "a11y")
# TODO: Create API endpoint POST /crawl
# TODO: Enable default API key of CrawlTheWorld if the docker env var of API_KEY is not set

# Connect to database
conn = psycopg2.connect(host=db_host, port=db_port, user=db_user, password=db_password, database=db_name)
cur = conn.cursor()

# Pick the least-recently-crawled active domain (never-crawled rows first).
cur.execute("SELECT domain, last_crawl_at FROM meta.domains WHERE crawl = TRUE AND active = TRUE ORDER BY last_crawl_at ASC NULLS FIRST LIMIT 1")
domain = cur.fetchone()
if not domain:
    # No active domains left to crawl — close the connection before exiting.
    cur.close()
    conn.close()
    raise SystemExit(0)
# FIX: log (not print) the domain string, and only after the emptiness check,
# instead of printing the raw row tuple (possibly None) beforehand.
logger.info(f"Selected {domain[0]} to crawl next")
# Claim the domain: flip crawl off and stamp the attempt time. FIX: commit
# immediately — previously this UPDATE rode along uncommitted until the
# spider's first insert, so a crash left the row claimable twice.
cur.execute("UPDATE meta.domains SET crawl = FALSE, last_crawl_at = now() WHERE domain = %s", (domain[0],))
conn.commit()
# TODO: Create function to crawl sitemaps
# TODO: Enable sitemap detection. If the URL to crawl is a sitemap, then limit URLs captured to those on the sitemap
# TODO: Enable auto-handling of parent/child sitemaps
class A11ySpider(Spider):
    """Crawl the selected domain, staging every discovered URL in Postgres.

    Discovered links are routed by file extension into staging.doc_urls,
    staging.image_urls, or staging.urls; failed requests are recorded in
    staging.bad_urls. Completion bookkeeping runs in closed().
    """

    name = "A11ySpider"

    # Lets ROLL
    def start_requests(self):
        """Seed the crawl with https://<domain> and record the crawl row."""
        # One UUID identifies this entire crawl run in every table; kept on
        # self so closed() can reach it after the crawl finishes.
        self.python_uuid = uuid.uuid4()
        cur.execute(
            "INSERT INTO results.crawls (domain, python_uuid) VALUES (%s, %s)",
            (domain[0], str(self.python_uuid)),
        )
        conn.commit()  # commit the transaction
        # FIX: log the domain string, not the whole (domain, last_crawl_at) row.
        self.logger.info(f"Created crawl for {domain[0]} with python_uuid of: {self.python_uuid}")
        url = "https://" + domain[0]
        yield Request(
            url,
            callback=self.parse,
            meta={"domain": domain[0], "python_uuid": self.python_uuid},
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'},
            errback=self.errback,
        )

    # Parse All The Things!
    def parse(self, response):
        """Stage every link on the page and follow same-domain page links."""
        crawl_domain = response.meta["domain"]   # domain string for this crawl
        python_uuid = response.meta["python_uuid"]
        source_url = response.url
        for link in response.xpath("//a/@href").getall():
            if link is None:
                continue
            # Strip fragments and trailing slashes so URLs dedupe cleanly.
            link = re.sub(r"#.*$", "", link).rstrip("/")
            # Skip phone/email links and hrefs containing spaces.
            if re.search(r'tel:|mailto:| ', link):
                continue
            # Route by (naive) file extension — the last dot-separated chunk.
            # OPTIMIZE: Sorting of response urls
            # IDEA: Break database functions into own script
            file_extension = link.split(".")[-1].lower()
            if file_extension in ["pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "xml", "csv", "zip", "pages"]:
                # Document URLs go to the staging.doc_urls table.
                cur.execute("""
                    INSERT INTO staging.doc_urls (url, source_url, python_uuid)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (url) DO NOTHING
                """, (link, source_url, str(python_uuid)))
                conn.commit()  # commit the transaction
                logger.debug(f"Doc: +1 URL to staging.doc_urls table: {link}")
            elif file_extension in ["jpg", "jpeg", "png", "gif", "svg", "bmp"]:
                # Image URLs go to the staging.image_urls table.
                cur.execute("""
                    INSERT INTO staging.image_urls (url, source_url, python_uuid)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (url) DO NOTHING
                """, (link, source_url, str(python_uuid)))
                conn.commit()  # commit the transaction
                logger.debug(f"Image: +1 URL to staging.image_urls table: {link}")
            else:
                # Everything else is a page URL; absolutize relative links first.
                if not link.startswith("http"):
                    if link.startswith("/"):
                        link = f"https://{crawl_domain}{link}"
                    else:
                        link = f"https://{crawl_domain}/{link}"
                cur.execute("""
                    INSERT INTO staging.urls (url, python_uuid, source_url)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (url) DO NOTHING
                """, (link, str(python_uuid), source_url))
                conn.commit()  # commit the transaction
                logger.debug(f"URL: +1 URL to staging.urls table: {link}")
                # Follow links to other pages on the same domain.
                # FIX: attach the errback so failed follow-ups are recorded too.
                if link.startswith(f"https://{crawl_domain}"):
                    yield Request(link, callback=self.parse, meta={"domain": crawl_domain, "python_uuid": python_uuid, "source_url": source_url}, errback=self.errback)

    def closed(self, reason):
        """Mark the crawl complete once Scrapy closes the spider.

        FIX: the original end-of-crawl check (`isinstance(response, Request)
        and not response.follows`) could never be true — `response` in parse()
        is always a Response and has no `follows` attribute — so crawls were
        never marked complete. Scrapy calls closed() exactly once when the
        spider finishes, whatever the reason.
        """
        cur.execute("UPDATE results.crawls SET complete = TRUE WHERE python_uuid = %s", (str(self.python_uuid),))
        # Re-queue the domain for a future crawl and remember this run's UUID.
        cur.execute("UPDATE meta.domains SET crawl = TRUE, last_crawl_at = now(), last_crawl_uuid = %s WHERE domain = %s", (str(self.python_uuid), domain[0]))
        conn.commit()
        logger.info(f"Crawl Complete for: {domain[0]} ")
        cur.execute("SELECT process_staging_urls()")
        conn.commit()
        logger.info("Staging URLs Processed, Moving to Next Domain :)")

    # Bringing errBack!
    def errback(self, failure):
        """Record a failed request in staging.bad_urls; retry same-domain URLs."""
        request = failure.request
        url = request.url
        # FIX: read the domain string from the request meta; the module-level
        # `domain` is a (domain, last_crawl_at) row tuple, which previously
        # produced URLs like https://('example.com', ...)/path here.
        crawl_domain = request.meta.get("domain", domain[0])
        python_uuid = str(request.meta["python_uuid"])
        source_url = request.meta.get("source_url")
        # TODO: Add to problem_urls table
        # TODO: Consolidate problem and bad urls tables into problem_urls
        # Request URLs are normally absolute already; guard just in case.
        if not url.startswith("http"):
            if url.startswith("/"):
                url = f"https://{crawl_domain}{url}"
            else:
                url = f"https://{crawl_domain}/{url}"
        # Add naughty URLs to the staging.bad_urls table
        cur.execute("""
            INSERT INTO staging.bad_urls (url, python_uuid, source_url)
            VALUES (%s, %s, %s)
            ON CONFLICT (url) DO NOTHING
        """, (url, python_uuid, source_url))
        conn.commit()  # commit the transaction
        # FIX: the original logged `link`, a name that does not exist in this
        # method, raising NameError on every failed request.
        logger.warning(f"Bad URL: Added 1 URL to staging.bad_urls table: {url}")
        # Stalk links to other pages on the same domain; Scrapy's duplicate
        # filter drops exact repeats, so this cannot loop forever.
        if url.startswith(f"https://{crawl_domain}"):
            yield Request(url, callback=self.parse, meta={"domain": crawl_domain, "python_uuid": python_uuid, "source_url": source_url}, errback=self.errback)
# Crawler Settings
# More info: https://docs.scrapy.org/en/latest/topics/settings.html
#
# Testing out Autothrottle: https://docs.scrapy.org/en/latest/topics/autothrottle.html#autothrottle-algorithm
# REVIEW: Settings need cleaning up
# IDEA: Settings via Docker Env Vars
# IDEA: Settings via new API Endpoint PUT?
process = CrawlerProcess(settings={
    "BOT_NAME": "A11yCheck Bot",  # Name of Bot
    #"DOWNLOAD_DELAY": 1,  # Minimum seconds to delay between requests
    #"RANDOMIZE_DOWNLOAD_DELAY": True,  # Randomize DOWNLOAD_DELAY between 0.5 & 1.5x
    "COOKIES_ENABLED": False,  # Disable cookies
    "CONCURRENT_ITEMS": 50,  # Number of concurrent items (per response) to process
    "CONCURRENT_REQUESTS": 16,  # Maximum concurrent requests
    #"DEPTH_LIMIT": 3,  # Max depth that will be crawled. 0 for no limit
    "DNSCACHE_ENABLED": True,  # Enable DNS in-memory cache
    "DNS_TIMEOUT": 60,  # Timeout for processing DNS queries
    "HTTPCACHE_ENABLED": False,  # Enable or disable caching
    "CONCURRENT_REQUESTS_PER_DOMAIN": 16,  # Maximum concurrent requests per domain
    "ROBOTSTXT_OBEY": True,  # Obey robots.txt rules
    "AUTOTHROTTLE_ENABLED": True,  # Enable AutoThrottle extension
    "AUTOTHROTTLE_START_DELAY": 5,  # Initial delay before AutoThrottle starts adjusting the delay
    "AUTOTHROTTLE_TARGET_CONCURRENCY": 2,  # Target concurrency for AutoThrottle
    # Logging Settings
    "AUTOTHROTTLE_DEBUG": False,  # Debug logs on Autothrottle
    "LOG_LEVEL": "INFO",  # Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL
    #"LOG_FILE": "logs/crawl.log",  # Where to save logs
    "LOG_ENABLED": True  # Enable logging
})
# Run the spider; start() blocks until the crawl finishes, then the
# Twisted reactor stops and the script ends.
process.crawl(A11ySpider)
process.start()