🎨 Use BS4 for scrapping APKMirror

This commit is contained in:
Nikhil Badyal
2023-08-15 15:32:09 +05:30
parent 45e7f01b5d
commit 9571861cd0
+69 -60
View File
@@ -1,62 +1,84 @@
"""Downloader Class."""
import re
from typing import Any
import requests
from bs4 import BeautifulSoup
from loguru import logger
from selectolax.lexbor import LexborHTMLParser
from scripts.status_check import headers
from src.downloader.download import Downloader
from src.exceptions import AppNotFound
from src.utils import apkmirror_status_check, bs4_parser
class ApkMirror(Downloader):
"""Files downloader."""
def extract_download_link(self, page: str, app: str) -> None:
def _extract_force_download_link(self, link: str, app: str) -> None:
"""Extract force download link."""
r = requests.get(link, headers=headers)
if r.status_code != 200:
raise AppNotFound(f"Unable to connect with {link} on ApkMirror.")
soup = BeautifulSoup(r.text, bs4_parser)
notes_divs = soup.find(class_="tab-pane")
possible_links = notes_divs.find_all("a")
for possible_link in possible_links:
if possible_link.get("href") and "download.php?id=" in possible_link.get(
"href"
):
return self._download(
self.config.apk_mirror + possible_link["href"], f"{app}.apk"
)
raise AppNotFound(f"Unable to download apk from {link}")
def extract_download_link(self, main_page: str, app: str) -> None:
"""Function to extract the download link from apkmirror html page.
:param page: Url of the page
:param main_page: Url of the page
:param app: Name of the app
"""
logger.debug(f"Extracting download link from\n{page}")
parser = LexborHTMLParser(self.config.session.get(page).text)
resp = self.config.session.get(
self.config.apk_mirror + parser.css_first("a.accent_bg").attributes["href"]
logger.debug(f"Extracting download link from\n{main_page}")
r = requests.get(main_page, headers=headers)
if r.status_code != 200:
raise AppNotFound(f"Unable to connect with {main_page} on ApkMirror.")
soup = BeautifulSoup(r.text, bs4_parser)
download_button = soup.find(class_="center")
download_links = download_button.find_all("a")
final_download_link = None
for download_link in download_links:
if download_link.get("href"):
if "download/?key=" in download_link.get("href"):
final_download_link = download_link["href"]
break
if not final_download_link:
raise AppNotFound(f"Unable to download apk from {main_page}")
self._extract_force_download_link(
self.config.apk_mirror + final_download_link, app
)
parser = LexborHTMLParser(resp.text)
href = parser.css_first(
"p.notes:nth-child(3) > span:nth-child(1) > a:nth-child(1)"
).attributes["href"]
self._download(self.config.apk_mirror + href, f"{app}.apk")
def get_download_page(self, parser: LexborHTMLParser, main_page: str) -> str:
def get_download_page(self, main_page: str) -> str:
"""Function to get the download page in apk_mirror.
:param parser: Parser
:param main_page: Main Download Page in APK mirror(Index)
:return:
"""
logger.debug(f"Getting download page from {main_page}")
apm = parser.css(".apkm-badge")
sub_url = ""
for is_apm in apm:
parent_text = is_apm.parent.parent.text()
if "APK" in is_apm.text() and (
"arm64-v8a" in parent_text
or "universal" in parent_text
or "noarch" in parent_text
):
parser = is_apm.parent
sub_url = parser.css_first(".accent_color").attributes["href"]
break
if sub_url == "":
logger.exception(
f"Unable to find any apk on apkmirror_specific_version on {main_page}"
)
raise AppNotFound("Unable to find apk on apkmirror site.")
return self.config.apk_mirror + sub_url
r = requests.get(main_page, headers=headers)
if r.status_code != 200:
raise AppNotFound(f"Unable to connect with {main_page} on ApkMirror.")
soup = BeautifulSoup(r.text, bs4_parser)
list_widget = soup.find(class_="listWidget")
table_rows = list_widget.find_all(class_="table-row")
sub_url = None
for row in table_rows:
if row.find(class_="accent_color"):
apk_type = row.find(class_="apkm-badge").get_text()
if apk_type == "APK":
sub_url = row.find(class_="accent_color")["href"]
break
if not sub_url:
raise AppNotFound("Unable to download apk from APKMirror.")
return f"{self.config.apk_mirror}{sub_url}"
def specific_version(self, app: str, version: str) -> None:
"""Function to download the specified version of app from apkmirror.
@@ -67,10 +89,7 @@ class ApkMirror(Downloader):
"""
version = version.replace(".", "-")
main_page = f"{self.config.apk_mirror_version_urls.get(app)}-{version}-release/"
parser = LexborHTMLParser(
self.config.session.get(main_page, allow_redirects=True).text
)
download_page = self.get_download_page(parser, main_page)
download_page = self.get_download_page(main_page)
self.extract_download_link(download_page, app)
def latest_version(self, app: str, **kwargs: Any) -> None:
@@ -80,24 +99,14 @@ class ApkMirror(Downloader):
:param app: Name of the application
:return: Version of downloaded apk
"""
logger.debug(f"Trying to download {app}'s latest version from apkmirror")
page = self.config.apk_mirror_urls.get(app)
if not page:
logger.debug("Invalid app")
raise AppNotFound("Invalid app")
parser = LexborHTMLParser(self.config.session.get(page).text)
try:
main_page = parser.css_first(".appRowVariantTag>.accent_color").attributes[
"href"
]
except AttributeError:
# Handles a case when variants are not available
main_page = parser.css_first(".downloadLink").attributes["href"]
match = re.search(r"\d", main_page)
if not match:
logger.error("Cannot find app main page")
raise AppNotFound()
main_page = f"{self.config.apk_mirror}{main_page}"
parser = LexborHTMLParser(self.config.session.get(main_page).text)
download_page = self.get_download_page(parser, main_page)
self.extract_download_link(download_page, app)
from src.patches import Patches
package_name = Patches.get_package_name(app)
response = apkmirror_status_check(package_name)
if response["data"][0]["exists"]:
version = response["data"][0]["release"]["version"]
logger.debug(
f"Trying to download {app}'s latest version({version}) from apkmirror"
)
return self.specific_version(app, version)
raise AppNotFound("App not found on apkmirror.")