Scraper improvements (#488)

This commit is contained in:
IMXEren
2024-04-11 23:17:47 +05:30
committed by GitHub
parent f51e68103f
commit 9f8a7cc836
6 changed files with 131 additions and 15 deletions
+2 -2
View File
@@ -235,8 +235,8 @@ You can use any of the following methods to build.
1. Link Format - https://apksos.com/download-app/<package-name> 1. Link Format - https://apksos.com/download-app/<package-name>
2. Example Link - https://apksos.com/download-app/com.expensemanager 2. Example Link - https://apksos.com/download-app/com.expensemanager
4. APKPURE - Supports downloading only latest version 4. APKPURE - Supports downloading only latest version
1. Link Format - https://d.apkpure.com/b/APK/<package-name>?version=latest 1. Link Format - https://apkpure.net/-/<package-name>
2. Example Link - https://d.apkpure.com/b/APK/com.google.android.youtube?version=latest 2. Example Link - https://apkpure.net/-/com.google.android.youtube
5. APKMonk - Supports downloading any available version 5. APKMonk - Supports downloading any available version
1. Link Format - https://www.apkmonk.com/app/<package-name>/ 1. Link Format - https://www.apkmonk.com/app/<package-name>/
2. Example Link - https://www.apkmonk.com/app/com.google.android.youtube/ 2. Example Link - https://www.apkmonk.com/app/com.google.android.youtube/
+1 -1
View File
@@ -28,7 +28,7 @@ class ApkMonk(Downloader):
handle_request_response(r, page) handle_request_response(r, page)
soup = BeautifulSoup(r.text, bs4_parser) soup = BeautifulSoup(r.text, bs4_parser)
download_scripts = soup.find_all("script", type="text/javascript") download_scripts = soup.find_all("script", type="text/javascript")
key_value_pattern = r'\{"pkg":"([^"]+)","key":"([^"]+)"\}' key_value_pattern = r"pkg=([^&]+)&key=([^']+)"
url = None url = None
for script in download_scripts: for script in download_scripts:
if match := re.search(key_value_pattern, script.text): if match := re.search(key_value_pattern, script.text):
+122 -6
View File
@@ -2,19 +2,135 @@
from typing import Any, Self from typing import Any, Self
import requests
from bs4 import BeautifulSoup
from loguru import logger
from src.app import APP from src.app import APP
from src.downloader.download import Downloader from src.downloader.download import Downloader
from src.exceptions import APKPureAPKDownloadError
from src.utils import bs4_parser, handle_request_response, request_header, request_timeout, slugify
class ApkPure(Downloader): class ApkPure(Downloader):
"""Files downloader.""" """Files downloader."""
def latest_version(self: Self, app: APP, **kwargs: Any) -> tuple[str, str]: default_archs_priority: tuple[str, ...] = ("arm64-v8a", "armeabi-v7a", "x86_64", "x86")
"""Function to download whatever the latest version of app from apkmirror.
@staticmethod
def _select_preferred_dl(app: str, apk_dls: list[str], xapk_dls: list[str]) -> tuple[str | None, str | None]:
file_name = None
app_dl = None
if apk_dls:
file_name = f"{app}.apk"
app_dl = apk_dls[0]
elif xapk_dls:
file_name = f"{app}.zip"
app_dl = xapk_dls[0]
return file_name, app_dl
def _sort_by_priority(self: Self, arch_list: list[str] | tuple[str]) -> list[str]:
"""Specifically used to sort the arch list based on order of elements of default archs priority list."""
return [darch for darch in self.default_archs_priority if darch in arch_list]
def _compare_dls(self: Self, dl1: str, dl2: str) -> int:
"""Compare two dls of same type (apk or xapk) to prioritise the archs on lower indices."""
from urllib.parse import parse_qs, urlparse
apk_type1 = parse_qs(urlparse(dl1).query).get("nc")
apk_type2 = parse_qs(urlparse(dl2).query).get("nc")
if apk_type1 and apk_type2:
l1 = len(apk_type1)
l2 = len(apk_type2)
# Indicates support for multiple archs, hence longer length
if l1 > l2:
return -1
if l1 < l2:
return 1
# Arrange based on priority list
priority = self.global_archs_priority or self.default_archs_priority
for arch in priority:
if arch in apk_type1 and arch not in apk_type2:
return -1
if arch not in apk_type1 and arch in apk_type2:
return 1
elif not apk_type1 and apk_type2:
return 1
elif apk_type1 and not apk_type2:
return -1
return 0
def extract_download_link(self: Self, page: str, app: str) -> tuple[str, str]:
    """Resolve the direct download link offered on an apkpure download page.

    Every ``#version-list a.download-btn`` anchor with an ``href`` is collected and
    split into XAPK links (those containing ``/b/XAPK/``) and all other links. Each
    group is ordered with ``_compare_dls`` and the winner is chosen by
    ``_select_preferred_dl``. When the page exposes a version label it is slugified
    and stored on ``self.app_version``.

    :param page: Url of the page
    :param app: Name of the app
    :return: Tuple of filename and app direct download link
    :raises APKPureAPKDownloadError: If no download link could be extracted from the page
    """
    from functools import cmp_to_key

    logger.debug(f"Extracting download link from\n{page}")
    response = requests.get(page, headers=request_header, timeout=request_timeout)
    handle_request_response(response, page)
    soup = BeautifulSoup(response.text, bs4_parser)

    # Anchors without an href are skipped; document order is kept within each group.
    hrefs: list[str] = [
        href for anchor in soup.select("#version-list a.download-btn") if (href := anchor.get("href"))
    ]  # type: ignore # noqa: PGH003
    by_arch = cmp_to_key(self._compare_dls)
    apk_links = sorted((link for link in hrefs if "/b/XAPK/" not in link), key=by_arch)
    xapk_links = sorted((link for link in hrefs if "/b/XAPK/" in link), key=by_arch)

    file_name, app_dl = self._select_preferred_dl(app, apk_links, xapk_links)
    if not (file_name and app_dl):
        msg = f"Unable to extract link from {app} version list"
        raise APKPureAPKDownloadError(msg, url=page)

    version_tag = soup.select_one("span.info-sdk > span")
    if version_tag:
        self.app_version = slugify(version_tag.get_text(strip=True))
        logger.info(f"Will be downloading {app}'s version {self.app_version}...")
    return file_name, app_dl
def specific_version(self: Self, app: APP, version: str) -> tuple[str, str]:
    """Download the requested version of the app from apkpure.

    The ``/versions`` page of ``app.download_source`` is scanned for an entry whose
    ``data-dt-version`` equals ``version``. That entry's link is resolved with
    ``extract_download_link`` and the file is downloaded.

    :param app: Application to download; its ``app_version`` is overwritten with ``self.app_version``
    :param version: Version of the application to download
    :return: Tuple of filename and app direct download link
    :raises APKPureAPKDownloadError: If the version list has no entry for ``version``
    """
    self.global_archs_priority = tuple(self._sort_by_priority(app.archs_to_build))
    version_page = app.download_source + "/versions"
    response = requests.get(version_page, headers=request_header, timeout=request_timeout)
    handle_request_response(response, version_page)
    soup = BeautifulSoup(response.text, bs4_parser)

    for entry in soup.select("ul.ver-wrap > *"):
        link = entry.select_one("a.ver_download_link")
        if not link:
            continue
        listed_version = link.get("data-dt-version")
        if not listed_version or listed_version != version:
            continue
        # First matching entry wins.
        download_page = link.get("href")
        file_name, download_source = self.extract_download_link(download_page, app.app_name)  # type: ignore # noqa: PGH003
        app.app_version = self.app_version
        logger.info(f"Guessed {app.app_version} for {app.app_name}")
        self._download(download_source, file_name)
        return file_name, download_source

    msg = f"Unable to find specific version '{version}' for {app} from version list"
    raise APKPureAPKDownloadError(msg, url=version_page)
def latest_version(self: Self, app: APP, **kwargs: Any) -> tuple[str, str]:
    """Download whatever build apkpure lists on the app's ``/download`` page.

    :param app: Application to download; its ``app_version`` is overwritten with ``self.app_version``
    :param kwargs: Not used by this implementation
    :return: Tuple of filename and app direct download link
    """
    preferred_archs = self._sort_by_priority(app.archs_to_build)
    self.global_archs_priority = tuple(preferred_archs)

    apk_name, direct_link = self.extract_download_link(app.download_source + "/download", app.app_name)

    app.app_version = self.app_version
    logger.info(f"Guessed {app.app_version} for {app.app_name}")
    self._download(direct_link, apk_name)
    return apk_name, direct_link
+3 -1
View File
@@ -26,8 +26,10 @@ class ApkSos(Downloader):
download_button = soup.find(class_="col-sm-12 col-md-8 text-center") download_button = soup.find(class_="col-sm-12 col-md-8 text-center")
possible_links = download_button.find_all("a") # type: ignore[union-attr] possible_links = download_button.find_all("a") # type: ignore[union-attr]
for possible_link in possible_links: for possible_link in possible_links:
if possible_link.get("href"): if possible_link.get("href") and (_title := possible_link.get("title")):
file_name = f"{app}.apk" file_name = f"{app}.apk"
if _title.endswith("Bundle"):
file_name = f"{app}.zip"
self._download(possible_link["href"], file_name) self._download(possible_link["href"], file_name)
return file_name, possible_link["href"] return file_name, possible_link["href"]
msg = f"Unable to download {app}" msg = f"Unable to download {app}"
+1 -2
View File
@@ -11,7 +11,6 @@ from src.downloader.google_drive import GoogleDrive
from src.downloader.sources import ( from src.downloader.sources import (
APK_MIRROR_BASE_URL, APK_MIRROR_BASE_URL,
APK_MONK_BASE_URL, APK_MONK_BASE_URL,
APK_PURE_BASE_APK_URL,
APK_PURE_BASE_URL, APK_PURE_BASE_URL,
APKS_SOS_BASE_URL, APKS_SOS_BASE_URL,
DRIVE_DOWNLOAD_BASE_URL, DRIVE_DOWNLOAD_BASE_URL,
@@ -36,7 +35,7 @@ class DownloaderFactory(object):
""" """
if apk_source.startswith(GITHUB_BASE_URL): if apk_source.startswith(GITHUB_BASE_URL):
return Github(config) return Github(config)
if apk_source.startswith((APK_PURE_BASE_URL, APK_PURE_BASE_APK_URL)): if apk_source.startswith(APK_PURE_BASE_URL):
return ApkPure(config) return ApkPure(config)
if apk_source.startswith(APKS_SOS_BASE_URL): if apk_source.startswith(APKS_SOS_BASE_URL):
return ApkSos(config) return ApkSos(config)
+2 -3
View File
@@ -5,10 +5,9 @@ APK_MIRROR_BASE_APK_URL = f"{APK_MIRROR_BASE_URL}/apk"
APK_MIRROR_PACKAGE_URL = f"{APK_MIRROR_BASE_URL}/?s=" + "{}" APK_MIRROR_PACKAGE_URL = f"{APK_MIRROR_BASE_URL}/?s=" + "{}"
APK_MIRROR_APK_CHECK = f"{APK_MIRROR_BASE_URL}/wp-json/apkm/v1/app_exists/" APK_MIRROR_APK_CHECK = f"{APK_MIRROR_BASE_URL}/wp-json/apkm/v1/app_exists/"
UPTODOWN_SUFFIX = "en.uptodown.com/android" UPTODOWN_SUFFIX = "en.uptodown.com/android"
UPTODOWN_BASE_URL = "https://{}.en.uptodown.com/android" UPTODOWN_BASE_URL = "https://{}." + UPTODOWN_SUFFIX
APK_PURE_BASE_URL = "https://apkpure.net" APK_PURE_BASE_URL = "https://apkpure.net"
APK_PURE_BASE_APK_URL = "https://d.apkpure.net/b/APK" APK_PURE_URL = APK_PURE_BASE_URL + "/-/{}"
APK_PURE_URL = APK_PURE_BASE_APK_URL + "/{}?version=latest"
APK_PURE_ICON_URL = APK_PURE_BASE_URL + "/search?q={}" APK_PURE_ICON_URL = APK_PURE_BASE_URL + "/search?q={}"
APKS_SOS_BASE_URL = "https://apksos.com/download-app" APKS_SOS_BASE_URL = "https://apksos.com/download-app"
APK_SOS_URL = APKS_SOS_BASE_URL + "/{}" APK_SOS_URL = APKS_SOS_BASE_URL + "/{}"