parser: reuse session for attachments, deduplicate news methods, remove unnecessary async

parent 0d2af83a
......@@ -17,6 +17,11 @@ class BaseParser:
resp.raise_for_status()
return await resp.text(encoding=encoding)
async def get_bytes(self, url: str) -> bytes:
    """Download *url* with the shared client session and return the raw body.

    Raises aiohttp's HTTP error when the response status is not 2xx.
    """
    async with self.session.get(url) as response:
        response.raise_for_status()
        return await response.read()
class NewsInfo:
def __init__(self, client: BaseParser):
......@@ -25,33 +30,28 @@ class NewsInfo:
async def news_urls(self) -> models.NewsURL:
# Delegates to the module-level urls_parser, reusing the shared client.
return await urls_parser(self.client)
async def bugs(self) -> models.BugsModel:
url = (await self.news_urls()).bugs
async def _get_packages(self, branch: str) -> models.PackagesModel | None:
    """Fetch and parse the packages news page for *branch*.

    Returns None when the parsed news URLs carry no link for that branch.
    The page is decoded as koi8-r before parsing.
    """
    page_url = getattr(await self.news_urls(), branch, None)
    if not page_url:
        return None
    page_html = await self.client.get(page_url, "koi8-r")
    return await packages_parser(page_html, page_url, self.client)
async def sisyphus(self) -> models.PackagesModel | None:
url = (await self.news_urls()).sisyphus
async def bugs(self) -> models.BugsModel | None:
    """Fetch and parse the bugs news page; None when no bugs URL is published."""
    page_url = (await self.news_urls()).bugs
    if not page_url:
        return None
    # Page is served in the legacy koi8-r encoding.
    page_html = await self.client.get(page_url, "koi8-r")
    return await bugs_parser(page_html, page_url)
async def sisyphus(self) -> models.PackagesModel | None:
# Packages news for the Sisyphus branch; None when unavailable.
return await self._get_packages("sisyphus")
async def p11(self) -> models.PackagesModel | None:
    """Packages news for the p11 branch; None when unavailable."""
    return await self._get_packages("p11")
async def p10(self) -> models.PackagesModel | None:
    """Packages news for the p10 branch; None when unavailable."""
    return await self._get_packages("p10")
class PackagesInfo:
......@@ -61,7 +61,7 @@ class PackagesInfo:
async def ftbfs(self) -> List[models.FTBFSModel]:
    """Fetch and parse the joined FTBFS (fails-to-build-from-source) stats
    for the Sisyphus x86_64 beehive."""
    stats_url = "https://git.altlinux.org/beehive/stats/Sisyphus-x86_64/ftbfs-joined"
    raw_text = await self.client.get(stats_url)
    return ftbfs_parser(raw_text)
async def watch_by_maintainer(
self,
......@@ -71,7 +71,7 @@ class PackagesInfo:
url = f"https://watch.altlinux.org/pub/watch/{by_acl}/{maintainer_nickname}.txt"
try:
text = await self.client.get(url)
return watch_parser(text)
# NOTE(review): bare `except:` silently swallows everything, including
# asyncio.CancelledError and KeyboardInterrupt; prefer catching the
# narrow network/HTTP errors (e.g. `except aiohttp.ClientError:`) if
# the intent is best-effort "no data -> empty list".
except:
return []
......
......@@ -28,7 +28,7 @@ async def bugs_parser(html: str, url: str):
current_bug = None
description_buffer = ""
section_name = await _get_bug_section_name(line)
section_name = _get_bug_section_name(line)
continue
bug_match = bug_pattern.match(line)
......@@ -64,7 +64,7 @@ async def bugs_parser(html: str, url: str):
return models.BugsModel(**data)
async def _get_bug_section_name(line: str) -> str:
def _get_bug_section_name(line: str) -> str:
line = line.lower()
if "new" in line and "resolved" in line:
return "quickly_resolved"
......
import aiohttp
from bs4 import BeautifulSoup
import re
import io
......@@ -6,7 +5,7 @@ import gzip
from .. import models
async def packages_parser(html: str, url: str):
async def packages_parser(html: str, url: str, client=None):
soup = BeautifulSoup(html, "html.parser")
pre_tag = soup.find("pre")
if not pre_tag:
......@@ -15,9 +14,11 @@ async def packages_parser(html: str, url: str):
pre_text = pre_tag.get_text(strip=True)
if "Было удалено вложение" in pre_text and "attachment" in pre_text:
attachment_link = pre_tag.find("a", href=True)
if attachment_link:
if attachment_link and client:
attachment_url = attachment_link["href"]
text = await _fetch(attachment_url)
compressed_data = await client.get_bytes(attachment_url)
with gzip.GzipFile(fileobj=io.BytesIO(compressed_data)) as gz:
text = gz.read().decode('utf-8')
else:
return models.PackagesModel(**{"added": [], "removed": [], "updated": [], "url": "none"})
else:
......@@ -71,7 +72,7 @@ async def packages_parser(html: str, url: str):
current_package = {
"name": match.group(1),
"description": await _clean_description(match.group(2)),
"description": _clean_description(match.group(2)),
}
seen_changelog = False
continue
......@@ -100,19 +101,8 @@ async def packages_parser(html: str, url: str):
return models.PackagesModel(**sections)
async def _clean_description(desc: str):
def _clean_description(desc: str):
desc = desc.strip()
desc = re.sub(r'\s+', ' ', desc)
desc = re.sub(r'\[\d+[KMG]?\]', '', desc).strip()
return desc
async def _fetch(url: str) -> str:
    """Download a gzip-compressed attachment and return its text decoded as UTF-8.

    Opens a fresh ClientSession per call; raises aiohttp's HTTP error on a
    non-2xx response.
    """
    async with aiohttp.ClientSession() as http:
        async with http.get(url) as response:
            response.raise_for_status()
            payload = await response.read()
    with gzip.GzipFile(fileobj=io.BytesIO(payload)) as gz:
        return gz.read().decode('utf-8')
from .. import models
async def ftbfs_parser(text: str):
def ftbfs_parser(text: str):
packages = []
for line in text.strip().splitlines():
parts = line.split('\t')
......
from .. import models
async def watch_parser(text: str):
def watch_parser(text: str):
return [
models.WatchByMaintainerModel(
pkg_name=parts[0],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment