From 9f8a7cc836113e3d6b652eab228c8642e779b66b Mon Sep 17 00:00:00 2001 From: IMXEren <96839938+IMXEren@users.noreply.github.com> Date: Thu, 11 Apr 2024 23:17:47 +0530 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Scraper=20improvements=20(#488)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 +- src/downloader/apkmonk.py | 2 +- src/downloader/apkpure.py | 128 ++++++++++++++++++++++++++++++++++++-- src/downloader/apksos.py | 4 +- src/downloader/factory.py | 3 +- src/downloader/sources.py | 5 +- 6 files changed, 131 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 5f898ab..f3aaa44 100644 --- a/README.md +++ b/README.md @@ -235,8 +235,8 @@ You can use any of the following methods to build. 1. Link Format - https://apksos.com/download-app/ 2. Example Link - https://apksos.com/download-app/com.expensemanager 4. APKPURE - Supports downloading only latest version - 1. Link Format - https://d.apkpure.com/b/APK/?version=latest - 2. Example Link - https://d.apkpure.com/b/APK/com.google.android.youtube?version=latest + 1. Link Format - https://apkpure.net/-/ + 2. Example Link - https://apkpure.net/-/com.google.android.youtube 5. APKMonk - Supports downloading any available version 1. Link Format - https://www.apkmonk.com/app// 2. Example Link - https://www.apkmonk.com/app// diff --git a/src/downloader/apkmonk.py b/src/downloader/apkmonk.py index 92b93a0..25ec363 100644 --- a/src/downloader/apkmonk.py +++ b/src/downloader/apkmonk.py @@ -28,7 +28,7 @@ class ApkMonk(Downloader): handle_request_response(r, page) soup = BeautifulSoup(r.text, bs4_parser) download_scripts = soup.find_all("script", type="text/javascript") - key_value_pattern = r'\{"pkg":"([^"]+)","key":"([^"]+)"\}' + key_value_pattern = r"pkg=([^&]+)&key=([^']+)" url = None for script in download_scripts: if match := re.search(key_value_pattern, script.text): diff --git a/src/downloader/apkpure.py b/src/downloader/apkpure.py index c9f8867..3198711 100644 --- a/src/downloader/apkpure.py +++ b/src/downloader/apkpure.py @@ -2,19 +2,135 @@ from typing import Any, Self +import requests +from bs4 import BeautifulSoup +from loguru import logger + from src.app import APP from src.downloader.download import Downloader +from src.exceptions import APKPureAPKDownloadError +from src.utils import bs4_parser, handle_request_response, request_header, request_timeout, slugify class ApkPure(Downloader): """Files downloader.""" - def latest_version(self: Self, app: APP, **kwargs: Any) -> tuple[str, str]: - """Function to download whatever the latest version of app from apkmirror. + default_archs_priority: tuple[str, ...] = ("arm64-v8a", "armeabi-v7a", "x86_64", "x86") + + @staticmethod + def _select_preferred_dl(app: str, apk_dls: list[str], xapk_dls: list[str]) -> tuple[str | None, str | None]: + file_name = None + app_dl = None + if apk_dls: + file_name = f"{app}.apk" + app_dl = apk_dls[0] + elif xapk_dls: + file_name = f"{app}.zip" + app_dl = xapk_dls[0] + return file_name, app_dl + + def _sort_by_priority(self: Self, arch_list: list[str] | tuple[str]) -> list[str]: + """Specifically used to sort the arch list based on order of elements of default archs priority list.""" + return [darch for darch in self.default_archs_priority if darch in arch_list] + + def _compare_dls(self: Self, dl1: str, dl2: str) -> int: + """Compare two dls of same type (apk or xapk) to prioritise the archs on lower indices.""" + from urllib.parse import parse_qs, urlparse + + apk_type1 = parse_qs(urlparse(dl1).query).get("nc") + apk_type2 = parse_qs(urlparse(dl2).query).get("nc") + if apk_type1 and apk_type2: + l1 = len(apk_type1) + l2 = len(apk_type2) + # Indicates support for multiple archs, hence longer length + if l1 > l2: + return -1 + if l1 < l2: + return 1 + # Arrange based on priority list + priority = self.global_archs_priority or self.default_archs_priority + for arch in priority: + if arch in apk_type1 and arch not in apk_type2: + return -1 + if arch not in apk_type1 and arch in apk_type2: + return 1 + elif not apk_type1 and apk_type2: + return 1 + elif apk_type1 and not apk_type2: + return -1 + return 0 + + def extract_download_link(self: Self, page: str, app: str) -> tuple[str, str]: + """Function to extract the download link from apkpure download page. + + :param page: Url of the page + :param app: Name of the app + :return: Tuple of filename and app direct download link + """ + from functools import cmp_to_key + + logger.debug(f"Extracting download link from\n{page}") + r = requests.get(page, headers=request_header, timeout=request_timeout) + handle_request_response(r, page) + soup = BeautifulSoup(r.text, bs4_parser) + apks = soup.select("#version-list a.download-btn") + _apk_dls: list[str] = [] + _xapk_dls: list[str] = [] + for apk in apks: + if _apk_dl := apk.get("href"): + if "/b/XAPK/" in _apk_dl: + _xapk_dls.append(_apk_dl) # type: ignore # noqa: PGH003 + else: + _apk_dls.append(_apk_dl) # type: ignore # noqa: PGH003 + _apk_dls.sort(key=cmp_to_key(self._compare_dls)) + _xapk_dls.sort(key=cmp_to_key(self._compare_dls)) + file_name, app_dl = self._select_preferred_dl(app, _apk_dls, _xapk_dls) + if not file_name or not app_dl: + msg = f"Unable to extract link from {app} version list" + raise APKPureAPKDownloadError(msg, url=page) + if app_version := soup.select_one("span.info-sdk > span"): + self.app_version = slugify(app_version.get_text(strip=True)) + logger.info(f"Will be downloading {app}'s version {self.app_version}...") + return file_name, app_dl + + def specific_version(self: Self, app: APP, version: str) -> tuple[str, str]: + """Function to download the specified version of app from apkpure. :param app: Name of the application - :return: Version of downloaded apk + :param version: Version of the application to download + :return: Tuple of filename and app direct download link """ - file_name = f"{app.app_name}.apk" - self._download(app.download_source, file_name) - return file_name, app.download_source + self.global_archs_priority = tuple(self._sort_by_priority(app.archs_to_build)) + version_page = app.download_source + "/versions" + r = requests.get(version_page, headers=request_header, timeout=request_timeout) + handle_request_response(r, version_page) + soup = BeautifulSoup(r.text, bs4_parser) + version_box_list = soup.select("ul.ver-wrap > *") + for box in version_box_list: + if ( + (_data := box.select_one("a.ver_download_link")) + and (found_version := _data.get("data-dt-version")) + and found_version == version + ): + download_page = _data.get("href") + file_name, download_source = self.extract_download_link(download_page, app.app_name) # type: ignore # noqa: PGH003 + app.app_version = self.app_version + logger.info(f"Guessed {app.app_version} for {app.app_name}") + self._download(download_source, file_name) + return file_name, download_source + msg = f"Unable to find specific version '{version}' for {app} from version list" + raise APKPureAPKDownloadError(msg, url=version_page) + + def latest_version(self: Self, app: APP, **kwargs: Any) -> tuple[str, str]: + """Function to download whatever the latest version of app from apkpure. + + :param app: Name of the application + :return: Tuple of filename and app direct download link + """ + self.global_archs_priority = tuple(self._sort_by_priority(app.archs_to_build)) + download_page = app.download_source + "/download" + file_name, download_source = self.extract_download_link(download_page, app.app_name) + app.app_version = self.app_version + logger.info(f"Guessed {app.app_version} for {app.app_name}") + self._download(download_source, file_name) + return file_name, download_source diff --git a/src/downloader/apksos.py b/src/downloader/apksos.py index 083e8d5..2bd3e99 100644 --- a/src/downloader/apksos.py +++ b/src/downloader/apksos.py @@ -26,8 +26,10 @@ class ApkSos(Downloader): download_button = soup.find(class_="col-sm-12 col-md-8 text-center") possible_links = download_button.find_all("a") # type: ignore[union-attr] for possible_link in possible_links: - if possible_link.get("href"): + if possible_link.get("href") and (_title := possible_link.get("title")): file_name = f"{app}.apk" + if _title.endswith("Bundle"): + file_name = f"{app}.zip" self._download(possible_link["href"], file_name) return file_name, possible_link["href"] msg = f"Unable to download {app}" diff --git a/src/downloader/factory.py b/src/downloader/factory.py index 0f8436a..206b6f6 100644 --- a/src/downloader/factory.py +++ b/src/downloader/factory.py @@ -11,7 +11,6 @@ from src.downloader.google_drive import GoogleDrive from src.downloader.sources import ( APK_MIRROR_BASE_URL, APK_MONK_BASE_URL, - APK_PURE_BASE_APK_URL, APK_PURE_BASE_URL, APKS_SOS_BASE_URL, DRIVE_DOWNLOAD_BASE_URL, @@ -36,7 +35,7 @@ class DownloaderFactory(object): """ if apk_source.startswith(GITHUB_BASE_URL): return Github(config) - if apk_source.startswith((APK_PURE_BASE_URL, APK_PURE_BASE_APK_URL)): + if apk_source.startswith(APK_PURE_BASE_URL): return ApkPure(config) if apk_source.startswith(APKS_SOS_BASE_URL): return ApkSos(config) diff --git a/src/downloader/sources.py b/src/downloader/sources.py index 950d426..7ee9132 100644 --- a/src/downloader/sources.py +++ b/src/downloader/sources.py @@ -5,10 +5,9 @@ APK_MIRROR_BASE_APK_URL = f"{APK_MIRROR_BASE_URL}/apk" APK_MIRROR_PACKAGE_URL = f"{APK_MIRROR_BASE_URL}/?s=" + "{}" APK_MIRROR_APK_CHECK = f"{APK_MIRROR_BASE_URL}/wp-json/apkm/v1/app_exists/" UPTODOWN_SUFFIX = "en.uptodown.com/android" -UPTODOWN_BASE_URL = "https://{}.en.uptodown.com/android" +UPTODOWN_BASE_URL = "https://{}." + UPTODOWN_SUFFIX APK_PURE_BASE_URL = "https://apkpure.net" -APK_PURE_BASE_APK_URL = "https://d.apkpure.net/b/APK" -APK_PURE_URL = APK_PURE_BASE_APK_URL + "/{}?version=latest" +APK_PURE_URL = APK_PURE_BASE_URL + "/-/{}" APK_PURE_ICON_URL = APK_PURE_BASE_URL + "/search?q={}" APKS_SOS_BASE_URL = "https://apksos.com/download-app" APK_SOS_URL = APKS_SOS_BASE_URL + "/{}"