Merge pull request #284 from nikhilbadyal/feature/280-use-beautifulsoup4-for-scrapping

🎨 Use BS4 for scraping
This commit is contained in:
Nikhil Badyal
2023-08-15 19:45:10 +05:30
committed by GitHub
4 changed files with 90 additions and 74 deletions
-1
View File
@@ -4,5 +4,4 @@ lastversion==3.0.1
loguru==0.7.0
pre-commit==3.3.3
requests==2.31.0
selectolax==0.3.16
tqdm==4.66.1
+68 -59
View File
@@ -1,62 +1,84 @@
"""Downloader Class."""
import re
from typing import Any
import requests
from bs4 import BeautifulSoup
from loguru import logger
from selectolax.lexbor import LexborHTMLParser
from scripts.status_check import headers
from src.downloader.download import Downloader
from src.exceptions import AppNotFound
from src.utils import apkmirror_status_check, bs4_parser
class ApkMirror(Downloader):
"""Files downloader."""
def extract_download_link(self, page: str, app: str) -> None:
def _extract_force_download_link(self, link: str, app: str) -> None:
"""Extract force download link."""
r = requests.get(link, headers=headers)
if r.status_code != 200:
raise AppNotFound(f"Unable to connect with {link} on ApkMirror.")
soup = BeautifulSoup(r.text, bs4_parser)
notes_divs = soup.find(class_="tab-pane")
possible_links = notes_divs.find_all("a")
for possible_link in possible_links:
if possible_link.get("href") and "download.php?id=" in possible_link.get(
"href"
):
return self._download(
self.config.apk_mirror + possible_link["href"], f"{app}.apk"
)
raise AppNotFound(f"Unable to download apk from {link}")
def extract_download_link(self, main_page: str, app: str) -> None:
"""Function to extract the download link from apkmirror html page.
:param page: Url of the page
:param main_page: Url of the page
:param app: Name of the app
"""
logger.debug(f"Extracting download link from\n{page}")
parser = LexborHTMLParser(self.config.session.get(page).text)
resp = self.config.session.get(
self.config.apk_mirror + parser.css_first("a.accent_bg").attributes["href"]
logger.debug(f"Extracting download link from\n{main_page}")
r = requests.get(main_page, headers=headers)
if r.status_code != 200:
raise AppNotFound(f"Unable to connect with {main_page} on ApkMirror.")
soup = BeautifulSoup(r.text, bs4_parser)
download_button = soup.find(class_="center")
download_links = download_button.find_all("a")
final_download_link = None
for download_link in download_links:
if download_link.get("href"):
if "download/?key=" in download_link.get("href"):
final_download_link = download_link["href"]
break
if not final_download_link:
raise AppNotFound(f"Unable to download apk from {main_page}")
self._extract_force_download_link(
self.config.apk_mirror + final_download_link, app
)
parser = LexborHTMLParser(resp.text)
href = parser.css_first(
"p.notes:nth-child(3) > span:nth-child(1) > a:nth-child(1)"
).attributes["href"]
self._download(self.config.apk_mirror + href, f"{app}.apk")
def get_download_page(self, parser: LexborHTMLParser, main_page: str) -> str:
def get_download_page(self, main_page: str) -> str:
"""Function to get the download page in apk_mirror.
:param parser: Parser
:param main_page: Main Download Page in APK mirror(Index)
:return:
"""
logger.debug(f"Getting download page from {main_page}")
apm = parser.css(".apkm-badge")
sub_url = ""
for is_apm in apm:
parent_text = is_apm.parent.parent.text()
if "APK" in is_apm.text() and (
"arm64-v8a" in parent_text
or "universal" in parent_text
or "noarch" in parent_text
):
parser = is_apm.parent
sub_url = parser.css_first(".accent_color").attributes["href"]
r = requests.get(main_page, headers=headers)
if r.status_code != 200:
raise AppNotFound(f"Unable to connect with {main_page} on ApkMirror.")
soup = BeautifulSoup(r.text, bs4_parser)
list_widget = soup.find(class_="listWidget")
table_rows = list_widget.find_all(class_="table-row")
sub_url = None
for row in table_rows:
if row.find(class_="accent_color"):
apk_type = row.find(class_="apkm-badge").get_text()
if apk_type == "APK":
sub_url = row.find(class_="accent_color")["href"]
break
if sub_url == "":
logger.exception(
f"Unable to find any apk on apkmirror_specific_version on {main_page}"
)
raise AppNotFound("Unable to find apk on apkmirror site.")
return self.config.apk_mirror + sub_url
if not sub_url:
raise AppNotFound("Unable to download apk from APKMirror.")
return f"{self.config.apk_mirror}{sub_url}"
def specific_version(self, app: str, version: str) -> None:
"""Function to download the specified version of app from apkmirror.
@@ -67,10 +89,7 @@ class ApkMirror(Downloader):
"""
version = version.replace(".", "-")
main_page = f"{self.config.apk_mirror_version_urls.get(app)}-{version}-release/"
parser = LexborHTMLParser(
self.config.session.get(main_page, allow_redirects=True).text
)
download_page = self.get_download_page(parser, main_page)
download_page = self.get_download_page(main_page)
self.extract_download_link(download_page, app)
def latest_version(self, app: str, **kwargs: Any) -> None:
@@ -80,24 +99,14 @@ class ApkMirror(Downloader):
:param app: Name of the application
:return: Version of downloaded apk
"""
logger.debug(f"Trying to download {app}'s latest version from apkmirror")
page = self.config.apk_mirror_urls.get(app)
if not page:
logger.debug("Invalid app")
raise AppNotFound("Invalid app")
parser = LexborHTMLParser(self.config.session.get(page).text)
try:
main_page = parser.css_first(".appRowVariantTag>.accent_color").attributes[
"href"
]
except AttributeError:
# Handles a case when variants are not available
main_page = parser.css_first(".downloadLink").attributes["href"]
match = re.search(r"\d", main_page)
if not match:
logger.error("Cannot find app main page")
raise AppNotFound()
main_page = f"{self.config.apk_mirror}{main_page}"
parser = LexborHTMLParser(self.config.session.get(main_page).text)
download_page = self.get_download_page(parser, main_page)
self.extract_download_link(download_page, app)
from src.patches import Patches
package_name = Patches.get_package_name(app)
response = apkmirror_status_check(package_name)
if response["data"][0]["exists"]:
version = response["data"][0]["release"]["version"]
logger.debug(
f"Trying to download {app}'s latest version({version}) from apkmirror"
)
return self.specific_version(app, version)
raise AppNotFound("App not found on apkmirror.")
+13 -9
View File
@@ -1,10 +1,13 @@
"""APK SOS Downloader Class."""
from typing import Any
from loguru import logger
from selectolax.lexbor import LexborHTMLParser
import requests
from bs4 import BeautifulSoup
from scripts.status_check import headers
from src.downloader.download import Downloader
from src.exceptions import AppNotFound
from src.utils import bs4_parser
class ApkSos(Downloader):
@@ -16,13 +19,14 @@ class ApkSos(Downloader):
:param page: Url of the page
:param app: Name of the app
"""
parser = LexborHTMLParser(self.config.session.get(page).text)
download_url = parser.css_first(
r"body > div > div > div > div > div.col-sm-12.col-md-8 > div.card.fluid.\.idma > "
"div.section.row > div.col-sm-12.col-md-8.text-center > p > a"
).attributes["href"]
self._download(download_url, f"{app}.apk")
logger.debug(f"Downloaded {app} apk from apk_combo_downloader in rt")
r = requests.get(page, headers=headers, allow_redirects=True)
soup = BeautifulSoup(r.text, bs4_parser)
download_button = soup.find(class_="col-sm-12 col-md-8 text-center")
possible_links = download_button.find_all("a")
for possible_link in possible_links:
if possible_link.get("href"):
return self._download(possible_link["href"], f"{app}.apk")
raise AppNotFound("Unable to download apk from apk_combo")
def latest_version(self, app: str, **kwargs: Any) -> None:
"""Function to download whatever the latest version of app from
+8 -4
View File
@@ -1,10 +1,11 @@
"""Upto Down Downloader."""
from typing import Any
import requests
from bs4 import BeautifulSoup
from loguru import logger
from selectolax.lexbor import LexborHTMLParser
from scripts.status_check import headers
from src.downloader.download import Downloader
from src.exceptions import AppNotFound
from src.utils import bs4_parser
@@ -14,9 +15,12 @@ class UptoDown(Downloader):
"""Files downloader."""
def extract_download_link(self, page: str, app: str) -> None:
parser = LexborHTMLParser(self.config.session.get(page).text)
main_page = parser.css_first("#detail-download-button")
download_url = main_page.attributes["data-url"]
r = requests.get(page, headers=headers, allow_redirects=True)
soup = BeautifulSoup(r.text, bs4_parser)
soup = soup.find(id="detail-download-button")
download_url = soup.get("data-url")
if not download_url:
raise AppNotFound("Unable to download from uptodown.")
self._download(download_url, f"{app}.apk")
logger.debug(f"Downloaded {app} apk from upto_down_downloader in rt")