320 lines
9.2 KiB
Python
Executable File
320 lines
9.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import base64
|
|
import datetime
|
|
import hashlib
|
|
import logging
|
|
import json
|
|
import pathlib
|
|
import ssl
|
|
import time
|
|
from json import JSONDecodeError
|
|
|
|
import mutagen
|
|
import tqdm
|
|
from devtools import debug
|
|
|
|
import urllib3
|
|
from mutagen.id3 import ID3Tags, TALB, Encoding, TRSN, TIT2, COMM, TDRC, APIC
|
|
from urllib3 import HTTPResponse
|
|
|
|
# Configure the root logger once at import time: DEBUG level, with each record
# rendered as "time - logger name - level - message".
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
|
def build_headers(site="ivoox"):
    """Return browser-style HTTP request headers for one of the scraped sites.

    Two header sets are kept: one addressed to ``podbay.fm`` and one addressed
    to ``vcore-web.ivoox.com``. Previously both were assigned to the same
    local name, so the podbay set was silently overwritten and could never be
    returned; they are now selectable, with the ivoox set as the default so
    existing ``build_headers()`` callers get exactly what they got before.

    Args:
        site: Name of the header set to return, ``"ivoox"`` (default) or
            ``"podbay"``.

    Returns:
        A newly built ``dict`` mapping header name to header value. Every
        call returns a fresh dict, so callers may modify the result.

    Raises:
        ValueError: If ``site`` is not one of the known header sets.
    """
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Gecko/20100101 Firefox/145.0"
    header_sets = {
        # NOTE(review): the Cookie below is a hard-coded Cloudflare "__cf_bm"
        # value; presumably it expires quickly - confirm whether it should be
        # refreshed or dropped before relying on this set.
        "podbay": {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "et,en-US;q=0.7,en;q=0.3",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "__cf_bm=kRBbuxnNp3a8SrNqVSiD72LQPCgzcrfZkFTOIRnPJ10-1761204872-1.0.1.1-LVegUjIrBUmhhnjrTJk_NvlI7dAIPLSU0dTcJnwnL6rVqB_9MCtUAAMdY.vVUouBxp9yplpkxQZbhR12SNUlLdq4rZ77cBydIdd6.p9eCZ4",
            "Host": "podbay.fm",
            "Pragma": "no-cache",
            "Priority": "u=4",
            "Referer": "https://podbay.fm/p/tramm-ja-buss",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": user_agent,
        },
        "ivoox": {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "en",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Host": "vcore-web.ivoox.com",
            "Origin": "https://www.ivoox.com",
            "Pragma": "no-cache",
            "Referer": "https://www.ivoox.com/",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-site",
            "TE": "trailers",
            "User-Agent": user_agent,
        },
    }
    try:
        return header_sets[site]
    except KeyError:
        known = ", ".join(sorted(header_sets))
        raise ValueError(f"Unknown header set {site!r}; expected one of: {known}") from None
|
|
|
|
# Cache of decoded JSON API responses, keyed by the request URL. It is filled
# from all_requests.json by load_responses() and written back by save_resposnes().
all_requests = {}
|
|
|
|
def load_responses():
    """Replace the module-level ``all_requests`` cache with the saved copy.

    Reads ``all_requests.json`` from the current working directory. When the
    file does not exist the cache is left exactly as it was.
    """
    global all_requests
    cache_file = pathlib.Path("all_requests.json")
    if not cache_file.exists():
        return
    with cache_file.open("r") as fh:
        all_requests = json.load(fh)
|
|
|
|
def save_resposnes():
    """Persist the module-level ``all_requests`` cache to ``all_requests.json``.

    The JSON is first written to a sibling temporary file and then renamed
    over the target. This function runs after every fetched page, so writing
    the target in place meant that an interruption mid-dump left a truncated
    file that failed to parse on the next start; with the rename the target
    always holds either the previous or the new complete cache.

    The function name (including its spelling) is kept because other code in
    this file calls it.
    """
    target = pathlib.Path("all_requests.json")
    scratch = target.with_name(target.name + ".tmp")
    try:
        with scratch.open("w") as f:
            json.dump(all_requests, f, indent=4)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        scratch.unlink(missing_ok=True)
        raise
    scratch.replace(target)
|
|
|
|
def get_tramm_buss_podbay():
    """Yield every episode of the "tramm-ja-buss" podcast from the podbay.fm API.

    Pages are requested one at a time starting at page 1 until a page with no
    episodes comes back. Each decoded page is stored in ``all_requests`` under
    its URL and persisted with ``save_resposnes()``; pages already present in
    the cache are served without a network request or delay.

    Yields:
        Episode dicts taken from ``["podcast"]["episodes"]`` of each page.
        When the consumer resumes the generator, the title and media URL of
        the episode just yielded are printed, tab-separated.

    Raises:
        RuntimeError: If the API answers with a status other than 200.
        JSONDecodeError: If a response body is not valid JSON.
    """
    slug = "tramm-ja-buss"
    http = urllib3.PoolManager()

    # build_headers() returns the header set addressed to ivoox
    # (Host: vcore-web.ivoox.com), so re-point the host-specific entries at
    # podbay.fm instead of sending another site's Host/Origin/Referer.
    headers = build_headers()
    headers["Host"] = "podbay.fm"
    headers["Referer"] = f"https://podbay.fm/p/{slug}"
    headers["Sec-Fetch-Site"] = "same-origin"
    headers.pop("Origin", None)

    page = 1
    while True:
        url = f"https://podbay.fm/api/podcast?slug={slug}&reverse=false&page={page}"
        if url in all_requests:
            j = all_requests[url]
        else:
            resp = http.request('GET', url, headers=headers)
            if resp.status != 200:
                # Fail before caching: a stored error payload would be
                # replayed from the cache on every later run.
                debug(resp.status, resp.data)
                raise RuntimeError(f"GET {url} returned HTTP {resp.status}")
            try:
                j = resp.json()
            except JSONDecodeError:
                debug(resp.data)
                raise
            all_requests[url] = j
            save_resposnes()
            time.sleep(1)  # be polite: at most one uncached request per second

        page += 1

        episodes = j['podcast']['episodes']
        if not episodes:
            break

        for episode in episodes:
            yield episode
            # Inner quotes differ from the outer ones so the f-string also
            # parses on Python versions older than 3.12.
            print(f"{episode['title']}\t{episode['mediaURL']}")
|
|
|
|
def get_ivoox(program_id):
    """Yield every audio item of an ivoox program, page by page.

    The API is asked for 20 items per page ordered by ``uploadDate``
    descending. Each decoded page is stored in ``all_requests`` under its URL
    and persisted with ``save_resposnes()``; pages already present in the
    cache are served without a network request or delay.

    Iteration stops at the first page whose item list is empty. Previously
    the loop had no exit at all, so after the last real page the generator
    kept requesting further pages forever.

    Args:
        program_id: The ivoox program id; converted with ``str()`` before
            being placed in the URL.

    Yields:
        Item dicts taken from ``["data"]["items"]`` of each page.

    Raises:
        RuntimeError: If the API answers with a status other than 200.
        JSONDecodeError: If a response body is not valid JSON.
    """
    http = urllib3.PoolManager()
    headers = build_headers()
    page = 1
    while True:
        # The URL text doubles as the cache key, so its exact form is kept.
        url = 'https://vcore-web.ivoox.com/v1/public/audios?filters={"program":{"eq":"'+str(program_id)+'"}}&orders={"uploadDate":"desc"}&limit=20&page='+str(page)
        if url in all_requests:
            j = all_requests[url]
        else:
            resp = http.request('GET', url, headers=headers)
            if resp.status != 200:
                # Fail before caching: a stored error payload would be
                # replayed from the cache on every later run.
                debug(resp.status, resp.data)
                raise RuntimeError(f"GET {url} returned HTTP {resp.status}")
            try:
                j = resp.json()
            except JSONDecodeError:
                debug(resp.data)
                raise
            all_requests[url] = j
            save_resposnes()
            time.sleep(1)  # be polite: at most one uncached request per second

        page += 1

        items = j['data']['items']
        if not items:
            # An empty page means there is nothing further to list.
            break
        yield from items
|
|
|
|
def download_episode(folder, episode):
    """Download one episode's audio into ``folder`` and return the file path.

    The file is named ``"<YYYY-MM-DD> - <title>.mp3"`` using the episode's
    publication date. The body is streamed to disk in 1 MiB chunks with a
    progress bar.

    Args:
        folder: Directory to store the file in. It is created when missing
            (its parents are not).
        episode: Mapping with ``title``, ``mediaURL`` and ``published`` keys;
            ``published`` must match ``%Y-%m-%dT%H:%M:%S.%fZ``.

    Returns:
        The ``pathlib.Path`` of the written file, or ``None`` when the server
        answered with a status other than 200 (details are printed with
        ``debug``).
    """
    # NOTE(review): TLS certificate verification is switched off here and the
    # resulting urllib3 warnings are silenced. Kept as-is because it looks
    # deliberate; confirm it is still required.
    urllib3.disable_warnings()
    http = urllib3.PoolManager(cert_reqs=ssl.CERT_NONE)
    headers = {
        "User-Agent": "Arhiveerija zirk.me"
    }
    published_at = datetime.datetime.strptime(episode['published'], "%Y-%m-%dT%H:%M:%S.%fZ")
    parent_folder = pathlib.Path(folder)
    parent_folder.mkdir(exist_ok=True)
    # A "/" inside the title would be taken as a directory separator and point
    # into a sub-directory that does not exist, so neutralise it.
    safe_title = episode['title'].replace("/", "-")
    path = parent_folder / f"{published_at:%Y-%m-%d} - {safe_title}.mp3"

    resp: HTTPResponse = http.request(
        'GET',
        episode['mediaURL'],
        headers=headers,
        preload_content=False
    )
    try:
        if resp.status != 200:
            debug(episode['mediaURL'], resp.status, resp.length_remaining)
            return None

        chunk_size = 1024 * 1024
        # length_remaining is None when the size is unknown; tqdm accepts
        # total=None. Round up so a final partial chunk is counted as well.
        remaining = resp.length_remaining
        total = None if remaining is None else -(-remaining // chunk_size)
        with open(path, "wb") as f:
            for chunk in tqdm.tqdm(resp.stream(chunk_size), total=total, unit="MB", ncols=120, desc=str(path)):
                f.write(chunk)
    finally:
        # preload_content=False leaves the connection checked out until it is
        # released explicitly.
        resp.release_conn()
    return path
|
|
|
|
def get_image(image_jwt):
    """Return the bytes of the image a token points at, using a disk cache.

    The token is split into three dot-separated segments; the middle one is
    decoded as base64url JSON and its ``url`` field names the image. Images
    are cached under ``cache/<sha256 of the URL>`` in the current working
    directory and only downloaded when no cache entry exists.

    Args:
        image_jwt: Token of the form ``<header>.<payload>.<signature>``
            (assumed to be a JWT, going by the parameter name). The signature
            is not verified.

    Returns:
        The image content as ``bytes``.

    Raises:
        ValueError: If the token does not consist of exactly three segments.
        Exception: If the image download answers with a status other than 200.
    """
    _, payload, _ = image_jwt.split(".")
    # JWT segments use the URL-safe alphabet ("-" and "_") and drop the "="
    # padding. Plain b64decode rejects the missing padding and silently
    # discards "-"/"_", so restore the padding and use the URL-safe decoder
    # (which still accepts standard "+" and "/" input).
    padded = payload + "=" * (-len(payload) % 4)
    claims = json.loads(base64.urlsafe_b64decode(padded))
    url = claims['url']

    cache_path = pathlib.Path("cache")
    cache_path.mkdir(exist_ok=True)
    cache_path_url = cache_path / str(hashlib.sha256(url.encode()).hexdigest())

    if not cache_path_url.exists():
        # Network objects are only needed on a cache miss.
        http = urllib3.PoolManager()
        headers = {
            "User-Agent": "Arhiveerija zirk.me"
        }
        resp: HTTPResponse = http.request('GET', url, headers=headers, preload_content=False)
        try:
            if resp.status != 200:
                raise Exception(f"Could not download image {url}")
            chunk_size = 1024 * 1024
            # length_remaining is None when the size is unknown; tqdm accepts
            # total=None. Round up so a final partial chunk is counted.
            remaining = resp.length_remaining
            total = None if remaining is None else -(-remaining // chunk_size)
            # Download next to the final name and rename afterwards, so an
            # interrupted transfer never becomes a (truncated) cache entry.
            partial = cache_path_url.with_name(cache_path_url.name + ".part")
            with partial.open("wb") as f:
                for chunk in tqdm.tqdm(resp.stream(chunk_size), total=total, unit="MB", ncols=80, desc=str(url)):
                    f.write(chunk)
            partial.replace(cache_path_url)
        finally:
            # preload_content=False leaves the connection checked out until
            # it is released explicitly.
            resp.release_conn()

    return cache_path_url.read_bytes()
|
|
|
|
def annotate_with_metadata(path: pathlib.Path, album, episode: dict):
    """Write ID3 tags describing ``episode`` into the audio file at ``path``.

    Sets album (TALB), title (TIT2), recording date (TDRC, ``YYYY-MM-DD``) and
    station name (TRSN, fixed to "Raadio 2"). A comment frame (COMM, language
    "est") is added when the episode has a non-empty description, and a cover
    picture (APIC) when the episode has an ``image`` key, whose value is
    resolved through ``get_image``.

    Args:
        path: Audio file to tag.
        album: Text for the album frame.
        episode: Mapping with ``title`` and ``published`` keys
            (``published`` must match ``%Y-%m-%dT%H:%M:%S.%fZ``) and optional
            ``description`` and ``image`` keys.

    Raises:
        Exception: If mutagen cannot identify the file type.
    """
    f = mutagen.File(path)
    if f is None:
        raise Exception(f"Could not parse {path}")

    # Compare with None rather than using truthiness: a tag that exists but
    # holds no frames is empty (falsy), and calling add_tags() when a tag is
    # already present raises instead of being a no-op.
    if f.tags is None:
        f.add_tags()
    tags: ID3Tags = f.tags

    tags.add(TALB(encoding=Encoding.UTF8, text=album))
    tags.add(TIT2(encoding=Encoding.UTF8, text=episode['title']))

    # The description is optional; a missing key is treated like an empty one.
    description = episode.get('description')
    if description:
        tags.add(COMM(encoding=Encoding.UTF8, lang="est", text=description))

    published_at = datetime.datetime.strptime(episode['published'], "%Y-%m-%dT%H:%M:%S.%fZ")
    tags.add(TDRC(encoding=Encoding.UTF8, text=f"{published_at:%Y-%m-%d}"))
    tags.add(TRSN(encoding=Encoding.UTF8, text="Raadio 2"))

    if "image" in episode:
        cover = get_image(episode["image"])
        # Label PNG data correctly (recognised by its 8-byte signature);
        # everything else keeps the previous "image/jpeg" label.
        mime = "image/png" if cover.startswith(b"\x89PNG\r\n\x1a\n") else "image/jpeg"
        tags.add(
            APIC(
                encoding=Encoding.UTF8,
                mime=mime,
                type=3,  # ID3 picture type 3: front cover
                desc="Cover",
                data=cover
            )
        )
    f.save()
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: archive every episode of one ivoox program into a
    # local folder and tag each downloaded file.
    load_responses()

    # Program ids passed to get_ivoox(); only "Tjuun In" is archived by this
    # run, the other id is kept for reference.
    ivoox_tramm_ja_buss = "373524"
    ivoox_tjuun_in = "377242"

    # The folder name is also used as the album tag.
    folder = "Tjuun In"

    for episode in get_ivoox(ivoox_tjuun_in):
        published_at = datetime.datetime.strptime(episode['uploadDate'], "%Y-%m-%d %H:%M:%S")
        # Re-shape the ivoox item into the key layout that download_episode()
        # and annotate_with_metadata() read.
        ep = {
            "title": episode['title'],
            "mediaURL": "https://www.ivoox.com/"+episode['mediaUrl'],
            "published": f"{published_at:%Y-%m-%dT%H:%M:%S.%fZ}",
            "description": episode['description'],
        }
        # Episodes whose title mentions "Erik Morna" are skipped.
        if "Erik Morna" in ep['title']:
            continue
        debug(ep)

        path = download_episode(folder, ep)
        if path is None:
            # download_episode() returns None when the server did not answer
            # with 200; there is no file to tag, so move on instead of
            # crashing in annotate_with_metadata().
            continue
        annotate_with_metadata(path, folder, ep)
|