init
This commit is contained in:
216
.gitignore
vendored
Normal file
216
.gitignore
vendored
Normal file
@@ -0,0 +1,216 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[codz]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
# Pipfile.lock
|
||||
|
||||
# UV
|
||||
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# uv.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
# poetry.lock
|
||||
# poetry.toml
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
||||
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
||||
# pdm.lock
|
||||
# pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# pixi
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
||||
# pixi.lock
|
||||
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
||||
# in the .venv directory. It is recommended not to include this directory in version control.
|
||||
.pixi
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# Redis
|
||||
*.rdb
|
||||
*.aof
|
||||
*.pid
|
||||
|
||||
# RabbitMQ
|
||||
mnesia/
|
||||
rabbitmq/
|
||||
rabbitmq-data/
|
||||
|
||||
# ActiveMQ
|
||||
activemq-data/
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.envrc
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
.idea/
|
||||
|
||||
# Abstra
|
||||
# Abstra is an AI-powered process automation framework.
|
||||
# Ignore directories containing user credentials, local state, and settings.
|
||||
# Learn more at https://abstra.io/docs
|
||||
.abstra/
|
||||
|
||||
# Visual Studio Code
|
||||
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
||||
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
||||
# you could uncomment the following to ignore the entire vscode folder
|
||||
# .vscode/
|
||||
|
||||
# Ruff stuff:
|
||||
.ruff_cache/
|
||||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
||||
# Marimo
|
||||
marimo/_static/
|
||||
marimo/_lsp/
|
||||
__marimo__/
|
||||
|
||||
# Streamlit
|
||||
.streamlit/secrets.toml
|
||||
319
main.py
Executable file
319
main.py
Executable file
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
import base64
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import json
|
||||
import pathlib
|
||||
import ssl
|
||||
import time
|
||||
from json import JSONDecodeError
|
||||
|
||||
import mutagen
|
||||
import tqdm
|
||||
from devtools import debug
|
||||
|
||||
import urllib3
|
||||
from mutagen.id3 import ID3Tags, TALB, Encoding, TRSN, TIT2, COMM, TDRC, APIC
|
||||
from urllib3 import HTTPResponse
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
|
||||
def build_headers():
    """Build the browser-mimicking HTTP request headers for the ivoox API.

    Returns:
        dict: header-name -> header-value pairs, ready to pass to
        ``urllib3.PoolManager.request``.

    NOTE(review): the original assigned a podbay.fm header set (including a
    hard-coded Cloudflare ``__cf_bm`` cookie) and immediately overwrote it
    with the ivoox set below, so only the ivoox headers were ever returned.
    The dead dict has been removed.  ``get_tramm_buss_podbay()`` therefore
    still receives these ivoox-targeted headers (Host: vcore-web.ivoox.com)
    exactly as before — confirm that is intended.
    """
    return {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "vcore-web.ivoox.com",
        "Origin": "https://www.ivoox.com",
        "Pragma": "no-cache",
        "Referer": "https://www.ivoox.com/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-site",
        "TE": "trailers",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Gecko/20100101 Firefox/145.0",
    }
|
||||
|
||||
# In-memory cache of API responses, keyed by request URL.  Loaded from and
# persisted to all_requests.json (see load_responses / save_resposnes) so
# re-runs do not re-fetch pages already seen.
all_requests = {}
|
||||
|
||||
def load_responses():
    """Populate the module-level ``all_requests`` cache from all_requests.json.

    Silently does nothing when the cache file does not exist yet.
    """
    global all_requests
    cache_file = pathlib.Path("all_requests.json")
    if cache_file.exists():
        with cache_file.open("r") as fh:
            all_requests = json.load(fh)
|
||||
|
||||
def save_resposnes():
    """Persist the module-level ``all_requests`` cache to all_requests.json.

    NOTE: the function name is misspelled ("resposnes"); it is kept because
    existing callers use it.  Prefer the ``save_responses`` alias below.
    """
    with open("all_requests.json", "w") as f:
        json.dump(all_requests, f, indent=4)


# Correctly spelled, backward-compatible alias for the legacy name above.
save_responses = save_resposnes
|
||||
|
||||
def get_tramm_buss_podbay():
    """Yield episode dicts for the 'tramm-ja-buss' podcast from podbay.fm.

    Pages through the podbay API until a page with no episodes is returned.
    Responses are cached in the module-level ``all_requests`` dict (persisted
    via ``save_resposnes``) so re-runs do not re-fetch pages; a 1-second
    sleep follows each live request to be polite to the API.

    Yields:
        dict: one episode object per iteration (keys include 'title' and
        'mediaURL', per the access pattern below).
    """
    slug = "tramm-ja-buss"
    http = urllib3.PoolManager()
    headers = build_headers()
    page = 1

    while True:
        url = f"https://podbay.fm/api/podcast?slug={slug}&reverse=false&page={page}"
        if url in all_requests:
            j = all_requests[url]
        else:
            resp = http.request('GET', url, headers=headers)
            try:
                j = resp.json()
            except JSONDecodeError:
                debug(resp.data)
                raise
            all_requests[url] = j
            save_resposnes()
            time.sleep(1)

        page += 1

        if len(j['podcast']['episodes']) == 0:
            break

        for episode in j['podcast']['episodes']:
            yield episode
            # Runs when the consumer resumes the generator (order preserved
            # from the original).  FIX: the original used double quotes for
            # the subscript keys inside a double-quoted f-string, which is a
            # SyntaxError before Python 3.12.
            print(f"{episode['title']}\t{episode['mediaURL']}")
|
||||
|
||||
def get_ivoox(program_id):
    """Yield audio-item dicts for an ivoox program (newest first).

    Args:
        program_id: ivoox numeric program id, as a string.

    Pages through the vcore-web.ivoox.com public audios API, caching each
    response in the module-level ``all_requests`` dict (persisted via
    ``save_resposnes``), with a 1-second sleep after each live request.

    Yields:
        dict: one item from ``data.items`` per iteration.
    """
    http = urllib3.PoolManager()
    headers = build_headers()
    page = 1
    while True:
        # URL built by concatenation (not an f-string) because the filter
        # value contains literal JSON braces.  Must stay byte-identical to
        # the original so existing all_requests.json cache keys still hit.
        url = ('https://vcore-web.ivoox.com/v1/public/audios?filters='
               '{"program":{"eq":"' + program_id + '"}}'
               '&orders={"uploadDate":"desc"}&limit=20&page=' + str(page))
        if url in all_requests:
            j = all_requests[url]
        else:
            resp = http.request('GET', url, headers=headers)
            try:
                j = resp.json()
            except JSONDecodeError:
                debug(resp.data)
                raise
            all_requests[url] = j
            save_resposnes()
            time.sleep(1)

        page += 1

        items = j['data']['items']
        if not items:
            # FIX: the original had no termination condition — once the
            # listing was exhausted it kept requesting empty pages forever.
            # Mirrors the empty-page break in get_tramm_buss_podbay.
            break
        for item in items:
            yield item
|
||||
|
||||
def download_episode(folder, episode):
    """Download one episode MP3 into *folder*.

    Args:
        folder: destination directory name (created if missing).
        episode: dict with at least 'published' ("%Y-%m-%dT%H:%M:%S.%fZ"
            string), 'title' and 'mediaURL' keys.

    Returns:
        pathlib.Path of the written file, or None when the server did not
        answer 200 — callers must handle the None.
    """
    urllib3.disable_warnings()
    # Certificate verification is disabled on purpose for the media hosts.
    # NOTE(review): confirm this is acceptable for this archival use case.
    http = urllib3.PoolManager(cert_reqs=ssl.CERT_NONE)
    headers = {
        "User-Agent": "Arhiveerija zirk.me"
    }
    published_at = datetime.datetime.strptime(episode['published'], "%Y-%m-%dT%H:%M:%S.%fZ")
    parent_folder = pathlib.Path(folder)
    parent_folder.mkdir(exist_ok=True)
    path = parent_folder / pathlib.Path(f"{published_at:%Y-%m-%d} - {episode['title']}.mp3")

    resp: HTTPResponse = http.request(
        'GET',
        episode['mediaURL'],
        headers=headers,
        preload_content=False
    )

    if resp.status != 200:
        debug(episode['mediaURL'], resp.status, resp.length_remaining)
        resp.release_conn()  # FIX: don't leak the pooled connection on error
        return None

    chunk_size = 1024 * 1024
    # FIX: length_remaining is None when the server sends no Content-Length;
    # the original `// chunk_size` then raised TypeError.
    total_chunks = (resp.length_remaining or 0) // chunk_size
    with open(path, "wb") as f:
        for chunk in tqdm.tqdm(resp.stream(chunk_size), total=total_chunks, unit="MB", ncols=120, desc=str(path)):
            f.write(chunk)
    resp.release_conn()
    return path
|
||||
|
||||
def get_image(image_jwt):
    """Fetch (and cache on disk) the cover image referenced by a JWT.

    The episode 'image' field is a JWT whose payload JSON contains a 'url'
    key.  Images are cached under ./cache/<sha256(url)> so each URL is
    downloaded at most once.

    Args:
        image_jwt: a three-segment JWT string (header.payload.signature).

    Returns:
        bytes: the raw image data.

    Raises:
        Exception: when the image URL does not answer 200.
    """
    meta, payload, sig = image_jwt.split(".")
    # FIX: JWT segments are base64url-encoded WITHOUT padding (RFC 7515);
    # plain base64.b64decode raised binascii.Error on most tokens.  Re-pad
    # to a multiple of 4 and decode with the urlsafe alphabet.
    data = base64.urlsafe_b64decode(payload + "=" * (-len(payload) % 4))
    j = json.loads(data)
    http = urllib3.PoolManager()
    headers = {
        "User-Agent": "Arhiveerija zirk.me"
    }
    url = j['url']
    cache_path = pathlib.Path("cache")
    cache_path.mkdir(exist_ok=True)
    cache_path_url = cache_path / str(hashlib.sha256(url.encode()).hexdigest())
    if not cache_path_url.exists():
        resp: HTTPResponse = http.request('GET', url, headers=headers, preload_content=False)
        if resp.status != 200:
            raise Exception(f"Could not download image {url}")
        with cache_path_url.open("wb") as f:
            chunk_size = 1024 * 1024
            for chunk in tqdm.tqdm(resp.stream(chunk_size), total=resp.length_remaining // chunk_size, unit="MB", ncols=80, desc=str(url)):
                f.write(chunk)
    with cache_path_url.open("rb") as f:
        return f.read()
|
||||
|
||||
def annotate_with_metadata(path: pathlib.Path, album, episode: dict):
    """Write ID3 tags (album, title, comment, date, station, cover) to *path*.

    Args:
        path: audio file to tag (parsed by mutagen).
        album: album/show name, stored in the TALB frame.
        episode: dict with 'title', 'description', 'published'
            ("%Y-%m-%dT%H:%M:%S.%fZ" string) and optionally 'image'
            (a JWT understood by get_image).

    Raises:
        Exception: when mutagen cannot parse the file.
    """
    audio = mutagen.File(path)
    if audio is None:
        raise Exception(f"Could not parse {path}")

    if not audio.tags:
        audio.add_tags()
    tags: ID3Tags = audio.tags

    tags.add(TALB(encoding=Encoding.UTF8, text=album))
    tags.add(TIT2(encoding=Encoding.UTF8, text=episode['title']))

    description = episode['description']
    if description:
        # 'est' = Estonian, ISO 639-2 as required by the COMM frame.
        tags.add(COMM(encoding=Encoding.UTF8, lang="est", text=description))

    release_date = datetime.datetime.strptime(episode['published'], "%Y-%m-%dT%H:%M:%S.%fZ")
    tags.add(TDRC(encoding=Encoding.UTF8, text=f"{release_date:%Y-%m-%d}"))
    tags.add(TRSN(encoding=Encoding.UTF8, text="Raadio 2"))

    if "image" in episode:
        # type=3 marks the picture as the front cover.
        cover = APIC(
            encoding=Encoding.UTF8,
            mime="image/jpeg",
            type=3,
            desc="Cover",
            data=get_image(episode["image"]),
        )
        tags.add(cover)

    audio.save()
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    load_responses()
    ivoox_tramm_ja_buss = "373524"
    ivoox_tjuun_in = "377242"
    for episode in get_ivoox(ivoox_tjuun_in):
        folder = "Tjuun In"
        published_at = datetime.datetime.strptime(episode['uploadDate'], "%Y-%m-%d %H:%M:%S")
        # Normalize the ivoox item into the episode shape expected by
        # download_episode / annotate_with_metadata.
        ep = {
            "title": episode['title'],
            "mediaURL": "https://www.ivoox.com/" + episode['mediaUrl'],
            "published": f"{published_at:%Y-%m-%dT%H:%M:%S.%fZ}",
            "description": episode['description'],
        }
        # Skip these episodes — NOTE(review): reason not stated in source;
        # presumably archived elsewhere or not wanted.  Confirm.
        if "Erik Morna" in ep['title']:
            continue
        debug(ep)
        path = download_episode(folder, ep)
        if path is None:
            # FIX: download_episode returns None on a non-200 response; the
            # original then crashed inside annotate_with_metadata(None, ...).
            continue
        annotate_with_metadata(path, folder, ep)
|
||||
Reference in New Issue
Block a user