1
0
This commit is contained in:
2025-11-14 16:44:12 +02:00
commit 9e2f5f0b04
2 changed files with 535 additions and 0 deletions

216
.gitignore vendored Normal file
View File

@@ -0,0 +1,216 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# Redis
*.rdb
*.aof
*.pid
# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/
# ActiveMQ
activemq-data/
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml

319
main.py Executable file
View File

@@ -0,0 +1,319 @@
#!/usr/bin/env python3
import base64
import datetime
import hashlib
import logging
import json
import pathlib
import ssl
import time
from json import JSONDecodeError
import mutagen
import tqdm
from devtools import debug
import urllib3
from mutagen.id3 import ID3Tags, TALB, Encoding, TRSN, TIT2, COMM, TDRC, APIC
from urllib3 import HTTPResponse
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
def build_headers():
    """Return the HTTP request headers, as a flat ``{name: value}`` dict, used
    for the ivoox API requests.

    The values were captured from a real Firefox session (browser devtools
    export format) so the requests look like they come from the browser.

    NOTE: the original contained a second, earlier header table for podbay.fm
    that was immediately overwritten by the ivoox table below; that dead code
    has been removed without changing the returned value.
    """
    browser_headers = {
        "requestHeaders": {
            "headers": [
                {"name": "Accept", "value": "application/json, text/plain, */*"},
                {"name": "Accept-Encoding", "value": "gzip, deflate, br, zstd"},
                {"name": "Accept-Language", "value": "en"},
                {"name": "Cache-Control", "value": "no-cache"},
                {"name": "Connection", "value": "keep-alive"},
                {"name": "Host", "value": "vcore-web.ivoox.com"},
                {"name": "Origin", "value": "https://www.ivoox.com"},
                {"name": "Pragma", "value": "no-cache"},
                {"name": "Referer", "value": "https://www.ivoox.com/"},
                {"name": "Sec-Fetch-Dest", "value": "empty"},
                {"name": "Sec-Fetch-Mode", "value": "cors"},
                {"name": "Sec-Fetch-Site", "value": "same-site"},
                {"name": "TE", "value": "trailers"},
                {"name": "User-Agent", "value": "Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Gecko/20100101 Firefox/145.0"},
            ]
        }
    }
    # Flatten the devtools-style [{"name": ..., "value": ...}, ...] list.
    return {h["name"]: h["value"] for h in browser_headers["requestHeaders"]["headers"]}
# Module-level cache of parsed API responses, keyed by request URL.
all_requests = {}


def load_responses():
    """Populate the module-level response cache from all_requests.json, if present."""
    global all_requests
    cache_file = pathlib.Path("all_requests.json")
    if cache_file.exists():
        with cache_file.open("r") as fh:
            all_requests = json.load(fh)


def save_resposnes():
    """Persist the module-level response cache to all_requests.json.

    (The name keeps its historical typo because callers use it as-is.)
    """
    with open("all_requests.json", "w") as fh:
        json.dump(all_requests, fh, indent=4)
def get_tramm_buss_podbay():
    """Yield episode dicts for the "tramm-ja-buss" podcast from the podbay API.

    Pages through the API until an empty page is returned.  Each response is
    cached in the module-level all_requests dict (and written to disk via
    save_resposnes()) so re-runs do not hit the network again.
    """
    slug = "tramm-ja-buss"
    pool = urllib3.PoolManager()
    request_headers = build_headers()
    page = 1
    while True:
        url = f"https://podbay.fm/api/podcast?slug={slug}&reverse=false&page={page}"
        if url not in all_requests:
            resp = pool.request('GET', url, headers=request_headers)
            try:
                all_requests[url] = resp.json()
            except JSONDecodeError:
                # Dump the raw body before propagating: helps diagnose
                # HTML error pages / rate-limit responses.
                debug(resp.data)
                raise
            save_resposnes()
            time.sleep(1)  # be polite to the API between live requests
        j = all_requests[url]
        page += 1
        episodes = j['podcast']['episodes']
        if not episodes:
            break  # an empty page means we have paged past the last episode
        for episode in episodes:
            yield episode
            print(f"{episode['title']}\t{episode['mediaURL']}")
def get_ivoox(program_id):
    """Yield audio-item dicts for an ivoox program, paging through the public API.

    program_id: the numeric program id as a string (as seen in ivoox URLs).

    Responses are cached in the module-level all_requests dict.  Bug fix: the
    original looped forever once the API ran out of items (it never broke out
    of ``while True``); we now stop on the first empty page, mirroring
    get_tramm_buss_podbay().
    """
    http = urllib3.PoolManager()
    headers = build_headers()
    page = 1
    while True:
        url = ('https://vcore-web.ivoox.com/v1/public/audios?filters='
               '{"program":{"eq":"' + program_id + '"}}'
               '&orders={"uploadDate":"desc"}&limit=20&page=' + str(page))
        if url in all_requests:
            j = all_requests[url]
        else:
            resp = http.request('GET', url, headers=headers)
            try:
                j = resp.json()
            except JSONDecodeError:
                # Dump the raw body before propagating for easier debugging.
                debug(resp.data)
                raise
            all_requests[url] = j
            save_resposnes()
            time.sleep(1)  # be polite to the API between live requests
        page += 1
        items = j['data']['items']
        if not items:
            # No more pages: stop instead of polling ever-increasing page numbers.
            break
        for item in items:
            yield item
def download_episode(folder, episode):
    """Download one episode's media file into *folder*.

    episode: dict with 'published' ("%Y-%m-%dT%H:%M:%S.%fZ"), 'title' and
    'mediaURL' keys.
    Returns the pathlib.Path of the written mp3, or None when the server does
    not answer 200 (callers must handle the None case).

    TLS verification is deliberately disabled (cert_reqs=CERT_NONE) — NOTE
    (review): presumably some media hosts fail validation; confirm before
    tightening.
    """
    urllib3.disable_warnings()
    http = urllib3.PoolManager(cert_reqs=ssl.CERT_NONE)
    headers = {
        "User-Agent": "Arhiveerija zirk.me"
    }
    published_at = datetime.datetime.strptime(episode['published'], "%Y-%m-%dT%H:%M:%S.%fZ")
    parent_folder = pathlib.Path(folder)
    parent_folder.mkdir(parents=True, exist_ok=True)
    # Replace path separators so a title like "A/B" cannot escape the folder
    # or point at a non-existent subdirectory.
    safe_title = episode['title'].replace("/", "-")
    path = parent_folder / f"{published_at:%Y-%m-%d} - {safe_title}.mp3"
    resp: HTTPResponse = http.request(
        'GET',
        episode['mediaURL'],
        headers=headers,
        preload_content=False
    )
    try:
        if resp.status != 200:
            debug(episode['mediaURL'], resp.status, resp.length_remaining)
            return None
        chunk_size = 1024 * 1024
        # length_remaining is None when the server sends no Content-Length;
        # tqdm accepts total=None (unknown length) in that case.
        total = resp.length_remaining // chunk_size if resp.length_remaining else None
        with open(path, "wb") as f:
            for chunk in tqdm.tqdm(resp.stream(chunk_size), total=total, unit="MB", ncols=120, desc=str(path)):
                f.write(chunk)
    finally:
        # Return the connection to the pool even on error / early return
        # (preload_content=False means urllib3 will not do this for us).
        resp.release_conn()
    return path
def get_image(image_jwt):
    """Download (and cache on disk) the image referenced by a JWT.

    image_jwt: a JWT ("header.payload.signature") whose payload is a JSON
    object containing a 'url' field.  Returns the raw image bytes.

    Downloads are cached under cache/<sha256(url)> so each distinct URL is
    fetched at most once.  Raises Exception when the download fails.

    Bug fix: JWT payloads use *unpadded base64url* (RFC 7515), so the original
    ``base64.b64decode`` failed on missing '=' padding and on '-'/'_'
    characters; we restore padding and decode with the urlsafe alphabet.
    """
    meta, payload, sig = image_jwt.split(".")
    padded = payload + "=" * (-len(payload) % 4)
    j = json.loads(base64.urlsafe_b64decode(padded))
    url = j['url']
    cache_dir = pathlib.Path("cache")
    cache_dir.mkdir(exist_ok=True)
    cache_file = cache_dir / hashlib.sha256(url.encode()).hexdigest()
    if not cache_file.exists():
        http = urllib3.PoolManager()
        headers = {
            "User-Agent": "Arhiveerija zirk.me"
        }
        resp: HTTPResponse = http.request('GET', url, headers=headers, preload_content=False)
        if resp.status != 200:
            raise Exception(f"Could not download image {url}")
        chunk_size = 1024 * 1024
        with cache_file.open("wb") as f:
            for chunk in tqdm.tqdm(resp.stream(chunk_size), total=resp.length_remaining // chunk_size, unit="MB", ncols=80, desc=str(url)):
                f.write(chunk)
    return cache_file.read_bytes()
def annotate_with_metadata(path: pathlib.Path, album, episode: dict):
    """Write ID3 tags (album, title, comment, date, station, cover art) to the
    mp3 at *path*.

    album: album/programme name stored in the TALB frame.
    episode: dict with 'title', 'description', 'published'
    ("%Y-%m-%dT%H:%M:%S.%fZ") and optionally 'image' (a JWT consumed by
    get_image()).

    Raises Exception when mutagen cannot parse the file.
    """
    audio = mutagen.File(path)
    if audio is None:
        raise Exception(f"Could not parse {path}")
    if not audio.tags:
        audio.add_tags()
    id3: ID3Tags = audio.tags
    id3.add(TALB(encoding=Encoding.UTF8, text=album))
    id3.add(TIT2(encoding=Encoding.UTF8, text=episode['title']))
    if episode['description']:
        # COMM takes an ISO 639-2 language code; "est" = Estonian.
        id3.add(COMM(encoding=Encoding.UTF8, lang="est", text=episode['description']))
    broadcast_date = datetime.datetime.strptime(episode['published'], "%Y-%m-%dT%H:%M:%S.%fZ")
    id3.add(TDRC(encoding=Encoding.UTF8, text=f"{broadcast_date:%Y-%m-%d}"))
    id3.add(TRSN(encoding=Encoding.UTF8, text="Raadio 2"))
    if "image" in episode:
        # type=3 is the "front cover" picture type in the ID3v2 spec.
        cover = APIC(
            encoding=Encoding.UTF8,
            mime="image/jpeg",
            type=3,
            desc="Cover",
            data=get_image(episode["image"]),
        )
        id3.add(cover)
    audio.save()
if __name__ == '__main__':
    load_responses()
    # ivoox program ids, as they appear in the programme page URLs.
    ivoox_tramm_ja_buss = "373524"
    ivoox_tjuun_in = "377242"
    for episode in get_ivoox(ivoox_tjuun_in):
        folder = "Tjuun In"
        # ivoox reports "YYYY-MM-DD HH:MM:SS"; normalise to the ISO-with-Z
        # shape that download_episode()/annotate_with_metadata() expect.
        published_at = datetime.datetime.strptime(episode['uploadDate'], "%Y-%m-%d %H:%M:%S")
        ep = {
            "title": episode['title'],
            "mediaURL": "https://www.ivoox.com/" + episode['mediaUrl'],
            "published": f"{published_at:%Y-%m-%dT%H:%M:%S.%fZ}",
            "description": episode['description'],
        }
        if "Erik Morna" in ep['title']:
            # Deliberately skip episodes whose title mentions this name.
            continue
        debug(ep)
        path = download_episode(folder, ep)
        if path is None:
            # Download failed (non-200 response): skip tagging instead of
            # crashing annotate_with_metadata() on a missing file.
            continue
        annotate_with_metadata(path, folder, ep)