1
0
This commit is contained in:
2025-11-14 16:44:12 +02:00
commit 9e2f5f0b04
2 changed files with 535 additions and 0 deletions

216
.gitignore vendored Normal file
View File

@@ -0,0 +1,216 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# Redis
*.rdb
*.aof
*.pid
# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/
# ActiveMQ
activemq-data/
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml

319
main.py Executable file
View File

@@ -0,0 +1,319 @@
#!/usr/bin/env python3
import base64
import datetime
import hashlib
import logging
import json
import pathlib
import ssl
import time
from json import JSONDecodeError
import mutagen
import tqdm
from devtools import debug
import urllib3
from mutagen.id3 import ID3Tags, TALB, Encoding, TRSN, TIT2, COMM, TDRC, APIC
from urllib3 import HTTPResponse
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
def build_headers():
    """Return the HTTP request headers, as a flat ``{name: value}`` dict, used
    for the ivoox API requests.

    The values were captured from a real Firefox session (browser devtools
    export format) so the requests look like they come from the browser.

    NOTE: the original contained a second, earlier header table for podbay.fm
    that was immediately overwritten by the ivoox table below; that dead code
    has been removed without changing the returned value.
    """
    browser_headers = {
        "requestHeaders": {
            "headers": [
                {"name": "Accept", "value": "application/json, text/plain, */*"},
                {"name": "Accept-Encoding", "value": "gzip, deflate, br, zstd"},
                {"name": "Accept-Language", "value": "en"},
                {"name": "Cache-Control", "value": "no-cache"},
                {"name": "Connection", "value": "keep-alive"},
                {"name": "Host", "value": "vcore-web.ivoox.com"},
                {"name": "Origin", "value": "https://www.ivoox.com"},
                {"name": "Pragma", "value": "no-cache"},
                {"name": "Referer", "value": "https://www.ivoox.com/"},
                {"name": "Sec-Fetch-Dest", "value": "empty"},
                {"name": "Sec-Fetch-Mode", "value": "cors"},
                {"name": "Sec-Fetch-Site", "value": "same-site"},
                {"name": "TE", "value": "trailers"},
                {"name": "User-Agent", "value": "Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Gecko/20100101 Firefox/145.0"},
            ]
        }
    }
    # Flatten the devtools-style [{"name": ..., "value": ...}, ...] list.
    return {h["name"]: h["value"] for h in browser_headers["requestHeaders"]["headers"]}
# Module-level cache of parsed API responses, keyed by request URL.
all_requests = {}


def load_responses():
    """Populate the module-level response cache from all_requests.json, if present."""
    global all_requests
    cache_file = pathlib.Path("all_requests.json")
    if cache_file.exists():
        with cache_file.open("r") as fh:
            all_requests = json.load(fh)


def save_resposnes():
    """Persist the module-level response cache to all_requests.json.

    (The name keeps its historical typo because callers use it as-is.)
    """
    with open("all_requests.json", "w") as fh:
        json.dump(all_requests, fh, indent=4)
def get_tramm_buss_podbay():
    """Yield episode dicts for the "tramm-ja-buss" podcast from the podbay API.

    Pages through the API until an empty page is returned.  Each response is
    cached in the module-level all_requests dict (and written to disk via
    save_resposnes()) so re-runs do not hit the network again.
    """
    slug = "tramm-ja-buss"
    pool = urllib3.PoolManager()
    request_headers = build_headers()
    page = 1
    while True:
        url = f"https://podbay.fm/api/podcast?slug={slug}&reverse=false&page={page}"
        if url not in all_requests:
            resp = pool.request('GET', url, headers=request_headers)
            try:
                all_requests[url] = resp.json()
            except JSONDecodeError:
                # Dump the raw body before propagating: helps diagnose
                # HTML error pages / rate-limit responses.
                debug(resp.data)
                raise
            save_resposnes()
            time.sleep(1)  # be polite to the API between live requests
        j = all_requests[url]
        page += 1
        episodes = j['podcast']['episodes']
        if not episodes:
            break  # an empty page means we have paged past the last episode
        for episode in episodes:
            yield episode
            print(f"{episode['title']}\t{episode['mediaURL']}")
def get_ivoox(program_id):
    """Yield audio-item dicts for an ivoox program, paging through the public API.

    program_id: the numeric program id as a string (as seen in ivoox URLs).

    Responses are cached in the module-level all_requests dict.  Bug fix: the
    original looped forever once the API ran out of items (it never broke out
    of ``while True``); we now stop on the first empty page, mirroring
    get_tramm_buss_podbay().
    """
    http = urllib3.PoolManager()
    headers = build_headers()
    page = 1
    while True:
        url = ('https://vcore-web.ivoox.com/v1/public/audios?filters='
               '{"program":{"eq":"' + program_id + '"}}'
               '&orders={"uploadDate":"desc"}&limit=20&page=' + str(page))
        if url in all_requests:
            j = all_requests[url]
        else:
            resp = http.request('GET', url, headers=headers)
            try:
                j = resp.json()
            except JSONDecodeError:
                # Dump the raw body before propagating for easier debugging.
                debug(resp.data)
                raise
            all_requests[url] = j
            save_resposnes()
            time.sleep(1)  # be polite to the API between live requests
        page += 1
        items = j['data']['items']
        if not items:
            # No more pages: stop instead of polling ever-increasing page numbers.
            break
        for item in items:
            yield item
def download_episode(folder, episode):
    """Download one episode's media file into *folder*.

    episode: dict with 'published' ("%Y-%m-%dT%H:%M:%S.%fZ"), 'title' and
    'mediaURL' keys.
    Returns the pathlib.Path of the written mp3, or None when the server does
    not answer 200 (callers must handle the None case).

    TLS verification is deliberately disabled (cert_reqs=CERT_NONE) — NOTE
    (review): presumably some media hosts fail validation; confirm before
    tightening.
    """
    urllib3.disable_warnings()
    http = urllib3.PoolManager(cert_reqs=ssl.CERT_NONE)
    headers = {
        "User-Agent": "Arhiveerija zirk.me"
    }
    published_at = datetime.datetime.strptime(episode['published'], "%Y-%m-%dT%H:%M:%S.%fZ")
    parent_folder = pathlib.Path(folder)
    parent_folder.mkdir(parents=True, exist_ok=True)
    # Replace path separators so a title like "A/B" cannot escape the folder
    # or point at a non-existent subdirectory.
    safe_title = episode['title'].replace("/", "-")
    path = parent_folder / f"{published_at:%Y-%m-%d} - {safe_title}.mp3"
    resp: HTTPResponse = http.request(
        'GET',
        episode['mediaURL'],
        headers=headers,
        preload_content=False
    )
    try:
        if resp.status != 200:
            debug(episode['mediaURL'], resp.status, resp.length_remaining)
            return None
        chunk_size = 1024 * 1024
        # length_remaining is None when the server sends no Content-Length;
        # tqdm accepts total=None (unknown length) in that case.
        total = resp.length_remaining // chunk_size if resp.length_remaining else None
        with open(path, "wb") as f:
            for chunk in tqdm.tqdm(resp.stream(chunk_size), total=total, unit="MB", ncols=120, desc=str(path)):
                f.write(chunk)
    finally:
        # Return the connection to the pool even on error / early return
        # (preload_content=False means urllib3 will not do this for us).
        resp.release_conn()
    return path
def get_image(image_jwt):
    """Download (and cache on disk) the image referenced by a JWT.

    image_jwt: a JWT ("header.payload.signature") whose payload is a JSON
    object containing a 'url' field.  Returns the raw image bytes.

    Downloads are cached under cache/<sha256(url)> so each distinct URL is
    fetched at most once.  Raises Exception when the download fails.

    Bug fix: JWT payloads use *unpadded base64url* (RFC 7515), so the original
    ``base64.b64decode`` failed on missing '=' padding and on '-'/'_'
    characters; we restore padding and decode with the urlsafe alphabet.
    """
    meta, payload, sig = image_jwt.split(".")
    padded = payload + "=" * (-len(payload) % 4)
    j = json.loads(base64.urlsafe_b64decode(padded))
    url = j['url']
    cache_dir = pathlib.Path("cache")
    cache_dir.mkdir(exist_ok=True)
    cache_file = cache_dir / hashlib.sha256(url.encode()).hexdigest()
    if not cache_file.exists():
        http = urllib3.PoolManager()
        headers = {
            "User-Agent": "Arhiveerija zirk.me"
        }
        resp: HTTPResponse = http.request('GET', url, headers=headers, preload_content=False)
        if resp.status != 200:
            raise Exception(f"Could not download image {url}")
        chunk_size = 1024 * 1024
        with cache_file.open("wb") as f:
            for chunk in tqdm.tqdm(resp.stream(chunk_size), total=resp.length_remaining // chunk_size, unit="MB", ncols=80, desc=str(url)):
                f.write(chunk)
    return cache_file.read_bytes()
def annotate_with_metadata(path: pathlib.Path, album, episode: dict):
    """Write ID3 tags (album, title, comment, date, station, cover art) to the
    mp3 at *path*.

    album: album/programme name stored in the TALB frame.
    episode: dict with 'title', 'description', 'published'
    ("%Y-%m-%dT%H:%M:%S.%fZ") and optionally 'image' (a JWT consumed by
    get_image()).

    Raises Exception when mutagen cannot parse the file.
    """
    audio = mutagen.File(path)
    if audio is None:
        raise Exception(f"Could not parse {path}")
    if not audio.tags:
        audio.add_tags()
    id3: ID3Tags = audio.tags
    id3.add(TALB(encoding=Encoding.UTF8, text=album))
    id3.add(TIT2(encoding=Encoding.UTF8, text=episode['title']))
    if episode['description']:
        # COMM takes an ISO 639-2 language code; "est" = Estonian.
        id3.add(COMM(encoding=Encoding.UTF8, lang="est", text=episode['description']))
    broadcast_date = datetime.datetime.strptime(episode['published'], "%Y-%m-%dT%H:%M:%S.%fZ")
    id3.add(TDRC(encoding=Encoding.UTF8, text=f"{broadcast_date:%Y-%m-%d}"))
    id3.add(TRSN(encoding=Encoding.UTF8, text="Raadio 2"))
    if "image" in episode:
        # type=3 is the "front cover" picture type in the ID3v2 spec.
        cover = APIC(
            encoding=Encoding.UTF8,
            mime="image/jpeg",
            type=3,
            desc="Cover",
            data=get_image(episode["image"]),
        )
        id3.add(cover)
    audio.save()
if __name__ == '__main__':
    load_responses()
    # ivoox program ids, as they appear in the programme page URLs.
    ivoox_tramm_ja_buss = "373524"
    ivoox_tjuun_in = "377242"
    for episode in get_ivoox(ivoox_tjuun_in):
        folder = "Tjuun In"
        # ivoox reports "YYYY-MM-DD HH:MM:SS"; normalise to the ISO-with-Z
        # shape that download_episode()/annotate_with_metadata() expect.
        published_at = datetime.datetime.strptime(episode['uploadDate'], "%Y-%m-%d %H:%M:%S")
        ep = {
            "title": episode['title'],
            "mediaURL": "https://www.ivoox.com/" + episode['mediaUrl'],
            "published": f"{published_at:%Y-%m-%dT%H:%M:%S.%fZ}",
            "description": episode['description'],
        }
        if "Erik Morna" in ep['title']:
            # Deliberately skip episodes whose title mentions this name.
            continue
        debug(ep)
        path = download_episode(folder, ep)
        if path is None:
            # Download failed (non-200 response): skip tagging instead of
            # crashing annotate_with_metadata() on a missing file.
            continue
        annotate_with_metadata(path, folder, ep)