🎨 Use BS4 for scrapping APKMirror

This commit is contained in:
Nikhil Badyal
2023-08-15 15:32:09 +05:30
parent 45e7f01b5d
commit 9571861cd0
+69 -60
View File
@@ -1,62 +1,84 @@
"""Downloader Class.""" """Downloader Class."""
import re
from typing import Any from typing import Any
import requests
from bs4 import BeautifulSoup
from loguru import logger from loguru import logger
from selectolax.lexbor import LexborHTMLParser
from scripts.status_check import headers
from src.downloader.download import Downloader from src.downloader.download import Downloader
from src.exceptions import AppNotFound from src.exceptions import AppNotFound
from src.utils import apkmirror_status_check, bs4_parser
class ApkMirror(Downloader): class ApkMirror(Downloader):
"""Files downloader.""" """Files downloader."""
def extract_download_link(self, page: str, app: str) -> None: def _extract_force_download_link(self, link: str, app: str) -> None:
"""Extract force download link."""
r = requests.get(link, headers=headers)
if r.status_code != 200:
raise AppNotFound(f"Unable to connect with {link} on ApkMirror.")
soup = BeautifulSoup(r.text, bs4_parser)
notes_divs = soup.find(class_="tab-pane")
possible_links = notes_divs.find_all("a")
for possible_link in possible_links:
if possible_link.get("href") and "download.php?id=" in possible_link.get(
"href"
):
return self._download(
self.config.apk_mirror + possible_link["href"], f"{app}.apk"
)
raise AppNotFound(f"Unable to download apk from {link}")
def extract_download_link(self, main_page: str, app: str) -> None:
"""Function to extract the download link from apkmirror html page. """Function to extract the download link from apkmirror html page.
:param page: Url of the page :param main_page: Url of the page
:param app: Name of the app :param app: Name of the app
""" """
logger.debug(f"Extracting download link from\n{page}") logger.debug(f"Extracting download link from\n{main_page}")
parser = LexborHTMLParser(self.config.session.get(page).text) r = requests.get(main_page, headers=headers)
if r.status_code != 200:
resp = self.config.session.get( raise AppNotFound(f"Unable to connect with {main_page} on ApkMirror.")
self.config.apk_mirror + parser.css_first("a.accent_bg").attributes["href"] soup = BeautifulSoup(r.text, bs4_parser)
download_button = soup.find(class_="center")
download_links = download_button.find_all("a")
final_download_link = None
for download_link in download_links:
if download_link.get("href"):
if "download/?key=" in download_link.get("href"):
final_download_link = download_link["href"]
break
if not final_download_link:
raise AppNotFound(f"Unable to download apk from {main_page}")
self._extract_force_download_link(
self.config.apk_mirror + final_download_link, app
) )
parser = LexborHTMLParser(resp.text)
href = parser.css_first( def get_download_page(self, main_page: str) -> str:
"p.notes:nth-child(3) > span:nth-child(1) > a:nth-child(1)"
).attributes["href"]
self._download(self.config.apk_mirror + href, f"{app}.apk")
def get_download_page(self, parser: LexborHTMLParser, main_page: str) -> str:
"""Function to get the download page in apk_mirror. """Function to get the download page in apk_mirror.
:param parser: Parser :param parser: Parser
:param main_page: Main Download Page in APK mirror(Index) :param main_page: Main Download Page in APK mirror(Index)
:return: :return:
""" """
logger.debug(f"Getting download page from {main_page}") r = requests.get(main_page, headers=headers)
apm = parser.css(".apkm-badge") if r.status_code != 200:
sub_url = "" raise AppNotFound(f"Unable to connect with {main_page} on ApkMirror.")
for is_apm in apm: soup = BeautifulSoup(r.text, bs4_parser)
parent_text = is_apm.parent.parent.text() list_widget = soup.find(class_="listWidget")
if "APK" in is_apm.text() and ( table_rows = list_widget.find_all(class_="table-row")
"arm64-v8a" in parent_text sub_url = None
or "universal" in parent_text for row in table_rows:
or "noarch" in parent_text if row.find(class_="accent_color"):
): apk_type = row.find(class_="apkm-badge").get_text()
parser = is_apm.parent if apk_type == "APK":
sub_url = parser.css_first(".accent_color").attributes["href"] sub_url = row.find(class_="accent_color")["href"]
break break
if sub_url == "": if not sub_url:
logger.exception( raise AppNotFound("Unable to download apk from APKMirror.")
f"Unable to find any apk on apkmirror_specific_version on {main_page}" return f"{self.config.apk_mirror}{sub_url}"
)
raise AppNotFound("Unable to find apk on apkmirror site.")
return self.config.apk_mirror + sub_url
def specific_version(self, app: str, version: str) -> None: def specific_version(self, app: str, version: str) -> None:
"""Function to download the specified version of app from apkmirror. """Function to download the specified version of app from apkmirror.
@@ -67,10 +89,7 @@ class ApkMirror(Downloader):
""" """
version = version.replace(".", "-") version = version.replace(".", "-")
main_page = f"{self.config.apk_mirror_version_urls.get(app)}-{version}-release/" main_page = f"{self.config.apk_mirror_version_urls.get(app)}-{version}-release/"
parser = LexborHTMLParser( download_page = self.get_download_page(main_page)
self.config.session.get(main_page, allow_redirects=True).text
)
download_page = self.get_download_page(parser, main_page)
self.extract_download_link(download_page, app) self.extract_download_link(download_page, app)
def latest_version(self, app: str, **kwargs: Any) -> None: def latest_version(self, app: str, **kwargs: Any) -> None:
@@ -80,24 +99,14 @@ class ApkMirror(Downloader):
:param app: Name of the application :param app: Name of the application
:return: Version of downloaded apk :return: Version of downloaded apk
""" """
logger.debug(f"Trying to download {app}'s latest version from apkmirror") from src.patches import Patches
page = self.config.apk_mirror_urls.get(app)
if not page: package_name = Patches.get_package_name(app)
logger.debug("Invalid app") response = apkmirror_status_check(package_name)
raise AppNotFound("Invalid app") if response["data"][0]["exists"]:
parser = LexborHTMLParser(self.config.session.get(page).text) version = response["data"][0]["release"]["version"]
try: logger.debug(
main_page = parser.css_first(".appRowVariantTag>.accent_color").attributes[ f"Trying to download {app}'s latest version({version}) from apkmirror"
"href" )
] return self.specific_version(app, version)
except AttributeError: raise AppNotFound("App not found on apkmirror.")
# Handles a case when variants are not available
main_page = parser.css_first(".downloadLink").attributes["href"]
match = re.search(r"\d", main_page)
if not match:
logger.error("Cannot find app main page")
raise AppNotFound()
main_page = f"{self.config.apk_mirror}{main_page}"
parser = LexborHTMLParser(self.config.session.get(main_page).text)
download_page = self.get_download_page(parser, main_page)
self.extract_download_link(download_page, app)