How To Call Correct Class From URL Domain
I am currently working on a web crawler in which I want to call the correct class to scrape the web elements from a given URL. Currently I have created: import sys
Solution 1:
The problem is that `k.domain` returns `bbc`, whereas you wrote `url = 'bbc.co.uk'`, so the lookup key does not match. Use one of these solutions:
- use `url = 'bbc.co.uk'` along with `k.registered_domain`
- use `url = 'bbc'` along with `k.domain`
Also add a parameter to the `scrape` method so that it receives the response:
from abc import ABC, abstractmethod

import requests
import tldextract
class Scraper(ABC):
    """Base class and registry for domain-specific scrapers.

    Every subclass is recorded in ``scrapers`` under its class-level ``url``
    attribute as soon as the subclass is defined, so ``for_url`` can pick the
    matching implementation for a given address.

    Deriving from ``ABC`` is what makes ``@abstractmethod`` effective: the
    base class, and any subclass that does not implement ``scrape``, cannot
    be instantiated.
    """

    # Maps a registered domain (e.g. 'bbc.co.uk') to the scraper class for it.
    scrapers = {}

    def __init_subclass__(cls, **kwargs):
        """Register the newly defined subclass under its ``url`` attribute."""
        # Stay cooperative with other classes in the MRO.
        super().__init_subclass__(**kwargs)
        Scraper.scrapers[cls.url] = cls

    @classmethod
    def for_url(cls, url):
        """Return a new scraper instance for the registered domain of ``url``.

        Args:
            url: Address whose registered domain selects the scraper.

        Raises:
            KeyError: If no scraper is registered for that domain.
        """
        domain = tldextract.extract(url).registered_domain
        try:
            scraper_class = cls.scrapers[domain]
        except KeyError:
            # Same exception type as a plain dict miss, but with context.
            raise KeyError(
                f"no scraper registered for domain {domain!r} (url: {url!r})"
            ) from None
        return scraper_class()

    @abstractmethod
    def scrape(self, content: "requests.Response"):
        """Scrape the given HTTP response; subclasses must implement this."""
class BBCScraper(Scraper):
    """Scraper registered for the ``bbc.co.uk`` domain."""

    # Key under which the base class registers this scraper.
    url = 'bbc.co.uk'

    def scrape(self, content: requests.Response):
        """Return a fixed marker string; ``content`` is not inspected."""
        marker = "Scraped BBC News"
        return marker
if __name__ == "__main__":
    # Demo: pick the scraper from the URL's domain, fetch the page, and hand
    # the response to the scraper.
    url = 'https://www.bbc.co.uk/'
    scraper = Scraper.for_url(url)
    # Bound the request with a timeout: without one, requests may block
    # indefinitely when the server never answers.
    r = scraper.scrape(requests.get(url, timeout=10))
    print(r)  # Scraped BBC News
Improve
I'd suggest storing the URL in an attribute and moving the `requests.get` call into `scrape`, so there is less code in the main block:
class Scraper(ABC):
    """Base class and registry for domain-specific scrapers.

    Every subclass is recorded in ``scrapers`` under its class-level
    ``domain`` attribute as soon as the subclass is defined, so ``for_url``
    can pick the matching implementation for a given address.

    Deriving from ``ABC`` is what makes ``@abstractmethod`` effective: the
    base class, and any subclass that does not implement ``scrape``, cannot
    be instantiated.
    """

    # Maps a registered domain (e.g. 'bbc.co.uk') to the scraper class for it.
    scrapers = {}

    def __init_subclass__(cls, **kwargs):
        """Register the newly defined subclass under its ``domain`` attribute."""
        # Stay cooperative with other classes in the MRO.
        super().__init_subclass__(**kwargs)
        Scraper.scrapers[cls.domain] = cls

    @classmethod
    def for_url(cls, url):
        """Return a scraper for ``url``, constructed with that URL.

        Args:
            url: Address whose registered domain selects the scraper; it is
                also passed to the scraper's constructor.

        Raises:
            KeyError: If no scraper is registered for that domain.
        """
        domain = tldextract.extract(url).registered_domain
        try:
            scraper_class = cls.scrapers[domain]
        except KeyError:
            # Same exception type as a plain dict miss, but with context.
            raise KeyError(
                f"no scraper registered for domain {domain!r} (url: {url!r})"
            ) from None
        return scraper_class(url)

    @abstractmethod
    def scrape(self):
        """Scrape the scraper's page; subclasses must implement this."""
class BBCScraper(Scraper):
    """Scraper for pages on the ``bbc.co.uk`` domain."""

    # Key under which the base class registers this scraper.
    domain = 'bbc.co.uk'

    def __init__(self, url):
        """Remember the page URL so that ``scrape`` can fetch it later."""
        super().__init__()
        self.url = url

    def scrape(self):
        """Fetch ``self.url`` and return a marker plus the start of the body.

        Returns:
            ``"Scraped BBC News"`` followed by the first 20 characters of the
            response text.
        """
        # Annotate the local instead of chain-assigning: the previous
        # ``rep = requests.Response = requests.get(...)`` rebound the
        # ``requests.Response`` class to a response instance.
        # The timeout keeps the call from blocking forever on a silent server.
        rep: requests.Response = requests.get(self.url, timeout=10)
        content = rep.text  # ALL HTML CONTENT
        return "Scraped BBC News" + content[:20]
if __name__ == "__main__":
    # Demo: resolve the scraper from the URL's domain, then let it fetch and
    # scrape the page on its own.
    target_url = 'https://www.bbc.co.uk/'
    output = Scraper.for_url(target_url).scrape()
    print(output)  # Scraped BBC News<!DOCTYPE html><html
Post a Comment for "How To Call Correct Class From URL Domain"