Repository: ytdl-org/youtube-dl Branch: master Commit: 956b8c585591 Files: 980 Total size: 6.6 MB Directory structure: gitextract_gmvmbzf8/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── 1_broken_site.md │ │ ├── 2_site_support_request.md │ │ ├── 3_site_feature_request.md │ │ ├── 4_bug_report.md │ │ ├── 5_feature_request.md │ │ ├── 6_question.md │ │ └── config.yml │ ├── ISSUE_TEMPLATE_tmpl/ │ │ ├── 1_broken_site.md │ │ ├── 2_site_support_request.md │ │ ├── 3_site_feature_request.md │ │ ├── 4_bug_report.md │ │ └── 5_feature_request.md │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ └── ci.yml ├── .gitignore ├── AUTHORS ├── CONTRIBUTING.md ├── ChangeLog ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── devscripts/ │ ├── SizeOfImage.patch │ ├── SizeOfImage_w.patch │ ├── __init__.py │ ├── bash-completion.in │ ├── bash-completion.py │ ├── buildserver.py │ ├── check-porn.py │ ├── cli_to_api.py │ ├── create-github-release.py │ ├── fish-completion.in │ ├── fish-completion.py │ ├── generate_aes_testdata.py │ ├── gh-pages/ │ │ ├── add-version.py │ │ ├── generate-download.py │ │ ├── sign-versions.py │ │ ├── update-copyright.py │ │ ├── update-feed.py │ │ └── update-sites.py │ ├── lazy_load_template.py │ ├── make_contributing.py │ ├── make_issue_template.py │ ├── make_lazy_extractors.py │ ├── make_readme.py │ ├── make_supportedsites.py │ ├── posix-locale.sh │ ├── prepare_manpage.py │ ├── release.sh │ ├── run_tests.bat │ ├── run_tests.sh │ ├── show-downloads-statistics.py │ ├── utils.py │ ├── wine-py2exe.sh │ ├── zsh-completion.in │ └── zsh-completion.py ├── docs/ │ ├── .gitignore │ ├── Makefile │ ├── conf.py │ ├── index.rst │ ├── module_guide.rst │ └── supportedsites.md ├── setup.cfg ├── setup.py ├── test/ │ ├── __init__.py │ ├── helper.py │ ├── parameters.json │ ├── swftests/ │ │ ├── .gitignore │ │ ├── ArrayAccess.as │ │ ├── ClassCall.as │ │ ├── ClassConstruction.as │ │ ├── ConstArrayAccess.as │ │ ├── ConstantInt.as │ │ ├── DictCall.as │ │ ├── EqualsOperator.as │ │ ├── 
LocalVars.as │ │ ├── MemberAssignment.as │ │ ├── NeOperator.as │ │ ├── PrivateCall.as │ │ ├── PrivateVoidCall.as │ │ ├── StaticAssignment.as │ │ ├── StaticRetrieval.as │ │ ├── StringBasics.as │ │ ├── StringCharCodeAt.as │ │ └── StringConversion.as │ ├── test_InfoExtractor.py │ ├── test_YoutubeDL.py │ ├── test_YoutubeDLCookieJar.py │ ├── test_aes.py │ ├── test_age_restriction.py │ ├── test_all_urls.py │ ├── test_cache.py │ ├── test_compat.py │ ├── test_download.py │ ├── test_downloader_external.py │ ├── test_downloader_http.py │ ├── test_execution.py │ ├── test_http.py │ ├── test_iqiyi_sdk_interpreter.py │ ├── test_jsinterp.py │ ├── test_netrc.py │ ├── test_options.py │ ├── test_postprocessors.py │ ├── test_socks.py │ ├── test_subtitles.py │ ├── test_swfinterp.py │ ├── test_traversal.py │ ├── test_unicode_literals.py │ ├── test_update.py │ ├── test_utils.py │ ├── test_verbose_output.py │ ├── test_write_annotations.py │ ├── test_youtube_lists.py │ ├── test_youtube_misc.py │ ├── test_youtube_signature.py │ ├── testcert.pem │ ├── testdata/ │ │ ├── cookies/ │ │ │ ├── httponly_cookies.txt │ │ │ ├── malformed_cookies.txt │ │ │ └── session_cookies.txt │ │ ├── f4m/ │ │ │ └── custom_base_url.f4m │ │ ├── m3u8/ │ │ │ ├── pluzz_francetv_11507.m3u8 │ │ │ ├── teamcoco_11995.m3u8 │ │ │ ├── ted_18923.m3u8 │ │ │ ├── toggle_mobile_12211.m3u8 │ │ │ ├── twitch_vod.m3u8 │ │ │ └── vidio.m3u8 │ │ ├── mpd/ │ │ │ ├── float_duration.mpd │ │ │ ├── range_only.mpd │ │ │ ├── subtitles.mpd │ │ │ ├── unfragmented.mpd │ │ │ ├── url_and_range.mpd │ │ │ └── urls_only.mpd │ │ └── xspf/ │ │ └── foo_xspf.xspf │ └── versions.json ├── tox.ini ├── youtube-dl.plugin.zsh └── youtube_dl/ ├── YoutubeDL.py ├── __init__.py ├── __main__.py ├── aes.py ├── cache.py ├── casefold.py ├── compat.py ├── downloader/ │ ├── __init__.py │ ├── common.py │ ├── dash.py │ ├── external.py │ ├── f4m.py │ ├── fragment.py │ ├── hls.py │ ├── http.py │ ├── ism.py │ ├── niconico.py │ ├── rtmp.py │ └── rtsp.py ├── extractor/ │ ├── 
__init__.py │ ├── abc.py │ ├── abcnews.py │ ├── abcotvs.py │ ├── academicearth.py │ ├── acast.py │ ├── adn.py │ ├── adobeconnect.py │ ├── adobepass.py │ ├── adobetv.py │ ├── adultswim.py │ ├── aenetworks.py │ ├── afreecatv.py │ ├── airmozilla.py │ ├── aliexpress.py │ ├── aljazeera.py │ ├── allocine.py │ ├── alphaporno.py │ ├── alsace20tv.py │ ├── amara.py │ ├── amcnetworks.py │ ├── americastestkitchen.py │ ├── amp.py │ ├── animeondemand.py │ ├── anvato.py │ ├── aol.py │ ├── apa.py │ ├── aparat.py │ ├── appleconnect.py │ ├── applepodcasts.py │ ├── appletrailers.py │ ├── archiveorg.py │ ├── arcpublishing.py │ ├── ard.py │ ├── arkena.py │ ├── arnes.py │ ├── arte.py │ ├── asiancrush.py │ ├── atresplayer.py │ ├── atttechchannel.py │ ├── atvat.py │ ├── audimedia.py │ ├── audioboom.py │ ├── audiomack.py │ ├── awaan.py │ ├── aws.py │ ├── azmedien.py │ ├── baidu.py │ ├── bandaichannel.py │ ├── bandcamp.py │ ├── bbc.py │ ├── beatport.py │ ├── beeg.py │ ├── behindkink.py │ ├── bellmedia.py │ ├── bet.py │ ├── bfi.py │ ├── bfmtv.py │ ├── bibeltv.py │ ├── bigflix.py │ ├── bigo.py │ ├── bild.py │ ├── bilibili.py │ ├── biobiochiletv.py │ ├── biqle.py │ ├── bitchute.py │ ├── bleacherreport.py │ ├── blerp.py │ ├── bloomberg.py │ ├── bokecc.py │ ├── bongacams.py │ ├── bostonglobe.py │ ├── box.py │ ├── bpb.py │ ├── br.py │ ├── bravotv.py │ ├── breakcom.py │ ├── brightcove.py │ ├── businessinsider.py │ ├── buzzfeed.py │ ├── byutv.py │ ├── c56.py │ ├── caffeine.py │ ├── callin.py │ ├── camdemy.py │ ├── cammodels.py │ ├── camtube.py │ ├── camwithher.py │ ├── canalc2.py │ ├── canalplus.py │ ├── canvas.py │ ├── carambatv.py │ ├── cartoonnetwork.py │ ├── cbc.py │ ├── cbs.py │ ├── cbsinteractive.py │ ├── cbslocal.py │ ├── cbsnews.py │ ├── cbssports.py │ ├── ccc.py │ ├── ccma.py │ ├── cctv.py │ ├── cda.py │ ├── ceskatelevize.py │ ├── channel9.py │ ├── charlierose.py │ ├── chaturbate.py │ ├── chilloutzone.py │ ├── chirbit.py │ ├── cinchcast.py │ ├── cinemax.py │ ├── ciscolive.py │ ├── cjsw.py 
│ ├── clipchamp.py │ ├── cliphunter.py │ ├── clippit.py │ ├── cliprs.py │ ├── clipsyndicate.py │ ├── closertotruth.py │ ├── cloudflarestream.py │ ├── cloudy.py │ ├── clubic.py │ ├── clyp.py │ ├── cmt.py │ ├── cnbc.py │ ├── cnn.py │ ├── comedycentral.py │ ├── common.py │ ├── commonmistakes.py │ ├── commonprotocols.py │ ├── condenast.py │ ├── contv.py │ ├── corus.py │ ├── coub.py │ ├── cpac.py │ ├── cracked.py │ ├── crackle.py │ ├── crooksandliars.py │ ├── crunchyroll.py │ ├── cspan.py │ ├── ctsnews.py │ ├── ctv.py │ ├── ctvnews.py │ ├── cultureunplugged.py │ ├── curiositystream.py │ ├── cwtv.py │ ├── dailymail.py │ ├── dailymotion.py │ ├── daum.py │ ├── dbtv.py │ ├── dctp.py │ ├── deezer.py │ ├── defense.py │ ├── democracynow.py │ ├── dfb.py │ ├── dhm.py │ ├── digg.py │ ├── digiteka.py │ ├── discovery.py │ ├── discoverygo.py │ ├── discoverynetworks.py │ ├── discoveryvr.py │ ├── disney.py │ ├── dispeak.py │ ├── dlf.py │ ├── dlive.py │ ├── dotsub.py │ ├── douyutv.py │ ├── dplay.py │ ├── drbonanza.py │ ├── dreisat.py │ ├── dropbox.py │ ├── drtuber.py │ ├── drtv.py │ ├── dtube.py │ ├── dumpert.py │ ├── dvtv.py │ ├── dw.py │ ├── eagleplatform.py │ ├── ebaumsworld.py │ ├── echomsk.py │ ├── egghead.py │ ├── ehow.py │ ├── eighttracks.py │ ├── einthusan.py │ ├── eitb.py │ ├── ellentube.py │ ├── elpais.py │ ├── embedly.py │ ├── engadget.py │ ├── epidemicsound.py │ ├── eporner.py │ ├── eroprofile.py │ ├── escapist.py │ ├── espn.py │ ├── esri.py │ ├── europa.py │ ├── expotv.py │ ├── expressen.py │ ├── extractors.py │ ├── extremetube.py │ ├── eyedotv.py │ ├── facebook.py │ ├── faz.py │ ├── fc2.py │ ├── fczenit.py │ ├── fifa.py │ ├── filmon.py │ ├── filmweb.py │ ├── firsttv.py │ ├── fivemin.py │ ├── fivetv.py │ ├── flickr.py │ ├── folketinget.py │ ├── footyroom.py │ ├── formula1.py │ ├── fourtube.py │ ├── fox.py │ ├── fox9.py │ ├── foxgay.py │ ├── foxnews.py │ ├── foxsports.py │ ├── franceculture.py │ ├── franceinter.py │ ├── francetv.py │ ├── freesound.py │ ├── freespeech.py │ 
├── freshlive.py │ ├── frontendmasters.py │ ├── fujitv.py │ ├── funimation.py │ ├── funk.py │ ├── fusion.py │ ├── gaia.py │ ├── gameinformer.py │ ├── gamespot.py │ ├── gamestar.py │ ├── gaskrank.py │ ├── gazeta.py │ ├── gbnews.py │ ├── gdcvault.py │ ├── gedidigital.py │ ├── generic.py │ ├── gfycat.py │ ├── giantbomb.py │ ├── giga.py │ ├── gigya.py │ ├── glide.py │ ├── globalplayer.py │ ├── globo.py │ ├── go.py │ ├── godtube.py │ ├── golem.py │ ├── googledrive.py │ ├── googlepodcasts.py │ ├── googlesearch.py │ ├── goshgay.py │ ├── gputechconf.py │ ├── groupon.py │ ├── hbo.py │ ├── hearthisat.py │ ├── heise.py │ ├── hellporno.py │ ├── helsinki.py │ ├── hentaistigma.py │ ├── hgtv.py │ ├── hidive.py │ ├── historicfilms.py │ ├── hitbox.py │ ├── hitrecord.py │ ├── hketv.py │ ├── hornbunny.py │ ├── hotnewhiphop.py │ ├── hotstar.py │ ├── howcast.py │ ├── howstuffworks.py │ ├── hrfernsehen.py │ ├── hrti.py │ ├── huajiao.py │ ├── huffpost.py │ ├── hungama.py │ ├── hypem.py │ ├── ign.py │ ├── iheart.py │ ├── imdb.py │ ├── imggaming.py │ ├── imgur.py │ ├── ina.py │ ├── inc.py │ ├── indavideo.py │ ├── infoq.py │ ├── instagram.py │ ├── internazionale.py │ ├── internetvideoarchive.py │ ├── iprima.py │ ├── iqiyi.py │ ├── ir90tv.py │ ├── itv.py │ ├── ivi.py │ ├── ivideon.py │ ├── iwara.py │ ├── izlesene.py │ ├── jamendo.py │ ├── jeuxvideo.py │ ├── joj.py │ ├── jove.py │ ├── jwplatform.py │ ├── kakao.py │ ├── kaltura.py │ ├── kankan.py │ ├── karaoketv.py │ ├── karrierevideos.py │ ├── keezmovies.py │ ├── ketnet.py │ ├── khanacademy.py │ ├── kickstarter.py │ ├── kinja.py │ ├── kinopoisk.py │ ├── kommunetv.py │ ├── konserthusetplay.py │ ├── krasview.py │ ├── kth.py │ ├── ku6.py │ ├── kusi.py │ ├── kuwo.py │ ├── la7.py │ ├── laola1tv.py │ ├── lbry.py │ ├── lci.py │ ├── lcp.py │ ├── lecture2go.py │ ├── lecturio.py │ ├── leeco.py │ ├── lego.py │ ├── lemonde.py │ ├── lenta.py │ ├── libraryofcongress.py │ ├── libsyn.py │ ├── lifenews.py │ ├── limelight.py │ ├── line.py │ ├── linkedin.py │ 
├── linuxacademy.py │ ├── litv.py │ ├── livejournal.py │ ├── livestream.py │ ├── lnkgo.py │ ├── localnews8.py │ ├── lovehomeporn.py │ ├── lrt.py │ ├── lynda.py │ ├── m6.py │ ├── mailru.py │ ├── malltv.py │ ├── mangomolo.py │ ├── manyvids.py │ ├── maoritv.py │ ├── markiza.py │ ├── massengeschmacktv.py │ ├── matchtv.py │ ├── mdr.py │ ├── medaltv.py │ ├── medialaan.py │ ├── mediaset.py │ ├── mediasite.py │ ├── medici.py │ ├── megaphone.py │ ├── meipai.py │ ├── melonvod.py │ ├── meta.py │ ├── metacafe.py │ ├── metacritic.py │ ├── mgoon.py │ ├── mgtv.py │ ├── miaopai.py │ ├── microsoftvirtualacademy.py │ ├── minds.py │ ├── ministrygrid.py │ ├── minoto.py │ ├── miomio.py │ ├── mit.py │ ├── mitele.py │ ├── mixcloud.py │ ├── mlb.py │ ├── mnet.py │ ├── moevideo.py │ ├── mofosex.py │ ├── mojvideo.py │ ├── morningstar.py │ ├── motherless.py │ ├── motorsport.py │ ├── movieclips.py │ ├── moviezine.py │ ├── movingimage.py │ ├── msn.py │ ├── mtv.py │ ├── muenchentv.py │ ├── mwave.py │ ├── mychannels.py │ ├── myspace.py │ ├── myspass.py │ ├── myvi.py │ ├── myvideoge.py │ ├── myvidster.py │ ├── nationalgeographic.py │ ├── naver.py │ ├── nba.py │ ├── nbc.py │ ├── ndr.py │ ├── ndtv.py │ ├── nerdcubed.py │ ├── neteasemusic.py │ ├── netzkino.py │ ├── newgrounds.py │ ├── newstube.py │ ├── nextmedia.py │ ├── nexx.py │ ├── nfl.py │ ├── nhk.py │ ├── nhl.py │ ├── nick.py │ ├── niconico.py │ ├── ninecninemedia.py │ ├── ninegag.py │ ├── ninenow.py │ ├── nintendo.py │ ├── njpwworld.py │ ├── nobelprize.py │ ├── nonktube.py │ ├── noovo.py │ ├── normalboots.py │ ├── nosvideo.py │ ├── nova.py │ ├── nowness.py │ ├── noz.py │ ├── npo.py │ ├── npr.py │ ├── nrk.py │ ├── nrl.py │ ├── ntvcojp.py │ ├── ntvde.py │ ├── ntvru.py │ ├── nuevo.py │ ├── nuvid.py │ ├── nytimes.py │ ├── nzz.py │ ├── odatv.py │ ├── odnoklassniki.py │ ├── oktoberfesttv.py │ ├── once.py │ ├── ondemandkorea.py │ ├── onet.py │ ├── onionstudios.py │ ├── ooyala.py │ ├── openload.py │ ├── ora.py │ ├── orf.py │ ├── outsidetv.py │ ├── 
packtpub.py │ ├── palcomp3.py │ ├── pandoratv.py │ ├── parliamentliveuk.py │ ├── patreon.py │ ├── pbs.py │ ├── pearvideo.py │ ├── peekvids.py │ ├── peertube.py │ ├── people.py │ ├── performgroup.py │ ├── periscope.py │ ├── philharmoniedeparis.py │ ├── phoenix.py │ ├── photobucket.py │ ├── picarto.py │ ├── piksel.py │ ├── pinkbike.py │ ├── pinterest.py │ ├── pladform.py │ ├── platzi.py │ ├── playfm.py │ ├── playplustv.py │ ├── plays.py │ ├── playstuff.py │ ├── playtvak.py │ ├── playvid.py │ ├── playwire.py │ ├── pluralsight.py │ ├── podomatic.py │ ├── pokemon.py │ ├── polskieradio.py │ ├── popcorntimes.py │ ├── popcorntv.py │ ├── porn91.py │ ├── porncom.py │ ├── pornhd.py │ ├── pornhub.py │ ├── pornotube.py │ ├── pornovoisines.py │ ├── pornoxo.py │ ├── pr0gramm.py │ ├── presstv.py │ ├── prosiebensat1.py │ ├── puhutv.py │ ├── puls4.py │ ├── pyvideo.py │ ├── qqmusic.py │ ├── r7.py │ ├── radiobremen.py │ ├── radiocanada.py │ ├── radiode.py │ ├── radiofrance.py │ ├── radiojavan.py │ ├── rai.py │ ├── raywenderlich.py │ ├── rbgtum.py │ ├── rbmaradio.py │ ├── rds.py │ ├── redbulltv.py │ ├── reddit.py │ ├── redtube.py │ ├── regiotv.py │ ├── rentv.py │ ├── restudy.py │ ├── reuters.py │ ├── reverbnation.py │ ├── rice.py │ ├── rmcdecouverte.py │ ├── ro220.py │ ├── rockstargames.py │ ├── roosterteeth.py │ ├── rottentomatoes.py │ ├── roxwel.py │ ├── rozhlas.py │ ├── rtbf.py │ ├── rte.py │ ├── rtl2.py │ ├── rtlnl.py │ ├── rtp.py │ ├── rts.py │ ├── rtve.py │ ├── rtvnh.py │ ├── rtvs.py │ ├── ruhd.py │ ├── rumble.py │ ├── rutube.py │ ├── rutv.py │ ├── ruutu.py │ ├── ruv.py │ ├── s4c.py │ ├── safari.py │ ├── samplefocus.py │ ├── sapo.py │ ├── savefrom.py │ ├── sbs.py │ ├── screencast.py │ ├── screencastomatic.py │ ├── scrippsnetworks.py │ ├── scte.py │ ├── seeker.py │ ├── senateisvp.py │ ├── sendtonews.py │ ├── servus.py │ ├── sevenplus.py │ ├── sexu.py │ ├── seznamzpravy.py │ ├── shahid.py │ ├── shared.py │ ├── showroomlive.py │ ├── simplecast.py │ ├── sina.py │ ├── sixplay.py │ ├── 
sky.py │ ├── skyit.py │ ├── skylinewebcams.py │ ├── skynewsarabia.py │ ├── slideshare.py │ ├── slideslive.py │ ├── slutload.py │ ├── snotr.py │ ├── sohu.py │ ├── sonyliv.py │ ├── soundcloud.py │ ├── soundgasm.py │ ├── southpark.py │ ├── spankbang.py │ ├── spankwire.py │ ├── spiegel.py │ ├── spike.py │ ├── sport5.py │ ├── sportbox.py │ ├── sportdeutschland.py │ ├── spotify.py │ ├── spreaker.py │ ├── springboardplatform.py │ ├── sprout.py │ ├── srgssr.py │ ├── srmediathek.py │ ├── stanfordoc.py │ ├── steam.py │ ├── stitcher.py │ ├── storyfire.py │ ├── streamable.py │ ├── streamcloud.py │ ├── streamcz.py │ ├── streamsb.py │ ├── streetvoice.py │ ├── stretchinternet.py │ ├── stv.py │ ├── sunporno.py │ ├── sverigesradio.py │ ├── svt.py │ ├── swrmediathek.py │ ├── syfy.py │ ├── sztvhu.py │ ├── tagesschau.py │ ├── tass.py │ ├── tbs.py │ ├── tdslifeway.py │ ├── teachable.py │ ├── teachertube.py │ ├── teachingchannel.py │ ├── teamcoco.py │ ├── teamtreehouse.py │ ├── techtalks.py │ ├── ted.py │ ├── tele13.py │ ├── tele5.py │ ├── telebruxelles.py │ ├── telecinco.py │ ├── telegraaf.py │ ├── telemb.py │ ├── telequebec.py │ ├── teletask.py │ ├── telewebion.py │ ├── tennistv.py │ ├── tenplay.py │ ├── testurl.py │ ├── tf1.py │ ├── tfo.py │ ├── theintercept.py │ ├── theplatform.py │ ├── thescene.py │ ├── thestar.py │ ├── thesun.py │ ├── theweatherchannel.py │ ├── thisamericanlife.py │ ├── thisav.py │ ├── thisoldhouse.py │ ├── thisvid.py │ ├── threeqsdn.py │ ├── tiktok.py │ ├── tinypic.py │ ├── tmz.py │ ├── tnaflix.py │ ├── toggle.py │ ├── tonline.py │ ├── toongoggles.py │ ├── toutv.py │ ├── toypics.py │ ├── traileraddict.py │ ├── trilulilu.py │ ├── trovo.py │ ├── trunews.py │ ├── trutv.py │ ├── tube8.py │ ├── tubitv.py │ ├── tudou.py │ ├── tumblr.py │ ├── tunein.py │ ├── tunepk.py │ ├── turbo.py │ ├── turner.py │ ├── tv2.py │ ├── tv2dk.py │ ├── tv2hu.py │ ├── tv4.py │ ├── tv5mondeplus.py │ ├── tv5unis.py │ ├── tva.py │ ├── tvanouvelles.py │ ├── tvc.py │ ├── tver.py │ ├── tvigle.py │ 
├── tvland.py │ ├── tvn24.py │ ├── tvnet.py │ ├── tvnoe.py │ ├── tvnow.py │ ├── tvp.py │ ├── tvplay.py │ ├── tvplayer.py │ ├── tweakers.py │ ├── twentyfourvideo.py │ ├── twentymin.py │ ├── twentythreevideo.py │ ├── twitcasting.py │ ├── twitch.py │ ├── twitter.py │ ├── udemy.py │ ├── udn.py │ ├── ufctv.py │ ├── uktvplay.py │ ├── umg.py │ ├── unistra.py │ ├── unity.py │ ├── uol.py │ ├── uplynk.py │ ├── urort.py │ ├── urplay.py │ ├── usanetwork.py │ ├── usatoday.py │ ├── ustream.py │ ├── ustudio.py │ ├── varzesh3.py │ ├── vbox7.py │ ├── veehd.py │ ├── veoh.py │ ├── vesti.py │ ├── vevo.py │ ├── vgtv.py │ ├── vh1.py │ ├── vice.py │ ├── vidbit.py │ ├── viddler.py │ ├── videa.py │ ├── videodetective.py │ ├── videofyme.py │ ├── videomore.py │ ├── videopress.py │ ├── vidio.py │ ├── vidlii.py │ ├── vidme.py │ ├── vier.py │ ├── viewlift.py │ ├── viidea.py │ ├── viki.py │ ├── vimeo.py │ ├── vimple.py │ ├── vine.py │ ├── viqeo.py │ ├── viu.py │ ├── vk.py │ ├── vlive.py │ ├── vodlocker.py │ ├── vodpl.py │ ├── vodplatform.py │ ├── voicerepublic.py │ ├── voot.py │ ├── voxmedia.py │ ├── vrak.py │ ├── vrt.py │ ├── vrv.py │ ├── vshare.py │ ├── vtm.py │ ├── vube.py │ ├── vuclip.py │ ├── vvvvid.py │ ├── vyborymos.py │ ├── vzaar.py │ ├── wakanim.py │ ├── walla.py │ ├── washingtonpost.py │ ├── wat.py │ ├── watchbox.py │ ├── watchindianporn.py │ ├── wdr.py │ ├── webcaster.py │ ├── webofstories.py │ ├── weibo.py │ ├── weiqitv.py │ ├── whyp.py │ ├── wistia.py │ ├── worldstarhiphop.py │ ├── wsj.py │ ├── wwe.py │ ├── xbef.py │ ├── xboxclips.py │ ├── xfileshare.py │ ├── xhamster.py │ ├── xiami.py │ ├── ximalaya.py │ ├── xminus.py │ ├── xnxx.py │ ├── xstream.py │ ├── xtube.py │ ├── xuite.py │ ├── xvideos.py │ ├── xxxymovies.py │ ├── yahoo.py │ ├── yandexdisk.py │ ├── yandexmusic.py │ ├── yandexvideo.py │ ├── yapfiles.py │ ├── yesjapan.py │ ├── yinyuetai.py │ ├── ynet.py │ ├── youjizz.py │ ├── youku.py │ ├── younow.py │ ├── youporn.py │ ├── yourporn.py │ ├── yourupload.py │ ├── youtube.py │ ├── 
zapiks.py │ ├── zattoo.py │ ├── zdf.py │ ├── zhihu.py │ ├── zingmp3.py │ ├── zoom.py │ └── zype.py ├── jsinterp.py ├── options.py ├── postprocessor/ │ ├── __init__.py │ ├── common.py │ ├── embedthumbnail.py │ ├── execafterdownload.py │ ├── ffmpeg.py │ ├── metadatafromtitle.py │ └── xattrpp.py ├── socks.py ├── swfinterp.py ├── traversal.py ├── update.py ├── utils.py └── version.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/1_broken_site.md ================================================ --- name: Broken site support about: Report broken or misfunctioning site title: '' --- ## Checklist - [ ] I'm reporting a broken site support - [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones ## Verbose log ``` PASTE VERBOSE LOG HERE ``` ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE/2_site_support_request.md ================================================ --- name: Site support request about: Request support for a new site title: '' labels: 'site-support-request' --- ## Checklist - [ ] I'm reporting a new site support request - [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones ## Example URLs - Single video: https://www.youtube.com/watch?v=BaW_jenozKc - Single video: https://youtu.be/BaW_jenozKc - Playlist: 
https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE/3_site_feature_request.md ================================================ --- name: Site feature request about: Request a new functionality for a site title: '' --- ## Checklist - [ ] I'm reporting a site feature request - [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE/4_bug_report.md ================================================ --- name: Bug report about: Report a bug unrelated to any particular site or extractor title: '' --- ## Checklist - [ ] I'm reporting a bug unrelated to any particular site or extractor - [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones - [ ] I've read bugs section in FAQ ## Verbose log ``` PASTE VERBOSE LOG HERE ``` ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE/5_feature_request.md ================================================ --- name: Feature request about: Request a new functionality unrelated to any particular site or extractor title: '' labels: 'request' --- ## Checklist - [ ] I'm reporting a feature request - [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE/6_question.md 
================================================ --- name: Ask question about: Ask youtube-dl related question title: '' labels: 'question' --- ## Checklist - [ ] I'm asking a question - [ ] I've looked through the README and FAQ for similar questions - [ ] I've searched the bugtracker for similar questions including closed ones ## Question WRITE QUESTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false ================================================ FILE: .github/ISSUE_TEMPLATE_tmpl/1_broken_site.md ================================================ --- name: Broken site support about: Report broken or misfunctioning site title: '' --- ## Checklist - [ ] I'm reporting a broken site support - [ ] I've verified that I'm running youtube-dl version **%(version)s** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones ## Verbose log ``` PASTE VERBOSE LOG HERE ``` ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md ================================================ --- name: Site support request about: Request support for a new site title: '' labels: 'site-support-request' --- ## Checklist - [ ] I'm reporting a new site support request - [ ] I've verified that I'm running youtube-dl version **%(version)s** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones ## Example URLs - Single video: https://www.youtube.com/watch?v=BaW_jenozKc - Single video: https://youtu.be/BaW_jenozKc - 
Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md ================================================ --- name: Site feature request about: Request a new functionality for a site title: '' --- ## Checklist - [ ] I'm reporting a site feature request - [ ] I've verified that I'm running youtube-dl version **%(version)s** - [ ] I've searched the bugtracker for similar site feature requests including closed ones ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE_tmpl/4_bug_report.md ================================================ --- name: Bug report about: Report a bug unrelated to any particular site or extractor title: '' --- ## Checklist - [ ] I'm reporting a bug unrelated to any particular site or extractor - [ ] I've verified that I'm running youtube-dl version **%(version)s** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones - [ ] I've read bugs section in FAQ ## Verbose log ``` PASTE VERBOSE LOG HERE ``` ## Description WRITE DESCRIPTION HERE ================================================ FILE: .github/ISSUE_TEMPLATE_tmpl/5_feature_request.md ================================================ --- name: Feature request about: Request a new functionality unrelated to any particular site or extractor title: '' labels: 'request' --- ## Checklist - [ ] I'm reporting a feature request - [ ] I've verified that I'm running youtube-dl version **%(version)s** - [ ] I've searched the bugtracker for similar feature requests including closed ones ## Description WRITE DESCRIPTION HERE ================================================ FILE: 
.github/PULL_REQUEST_TEMPLATE.md ================================================ ## Please follow the guide below - You will be asked some questions, please read them **carefully** and answer honestly - Put an `x` into all the boxes [ ] relevant to your *pull request* (like that [x]) - Use *Preview* tab to see how your *pull request* will actually look like --- ### Before submitting a *pull request* make sure you have: - [ ] [Searched](https://github.com/ytdl-org/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests - [ ] Read [adding new extractor tutorial](https://github.com/ytdl-org/youtube-dl#adding-support-for-a-new-site) - [ ] Read [youtube-dl coding conventions](https://github.com/ytdl-org/youtube-dl#youtube-dl-coding-conventions) and adjusted the code to meet them - [ ] Covered the code with tests (note that PRs without tests will be REJECTED) - [ ] Checked the code with [flake8](https://pypi.python.org/pypi/flake8) ### In order to be accepted and merged into youtube-dl each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check one of the following options: - [ ] I am the original author of this code and I am willing to release it under [Unlicense](http://unlicense.org/) - [ ] I am not the original author of this code but it is in public domain or released under [Unlicense](http://unlicense.org/) (provide reliable evidence) ### What is the purpose of your *pull request*? - [ ] Bug fix - [ ] Improvement - [ ] New extractor - [ ] New feature --- ### Description of your *pull request* and other information Explanation of your *pull request* in arbitrary form goes here. Please make sure the description explains the purpose and effect of your *pull request* and is worded well enough to be understood. Provide as much context and examples as possible. 
================================================ FILE: .github/workflows/ci.yml ================================================ name: CI env: all-cpython-versions: 2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10, 3.11, 3.12 main-cpython-versions: 2.7, 3.2, 3.5, 3.9, 3.11 pypy-versions: pypy-2.7, pypy-3.6, pypy-3.7 cpython-versions: main test-set: core # Python beta version to be built using pyenv before setup-python support # Must also be included in all-cpython-versions next: 3.13 on: push: # push inputs aren't known to GitHub inputs: cpython-versions: type: string default: all test-set: type: string default: core pull_request: # pull_request inputs aren't known to GitHub inputs: cpython-versions: type: string default: main test-set: type: string default: both workflow_dispatch: inputs: cpython-versions: type: choice description: CPython versions (main = 2.7, 3.2, 3.5, 3.9, 3.11) options: - all - main required: true default: main test-set: type: choice description: core, download options: - both - core - download required: true default: both permissions: contents: read jobs: select: name: Select tests from inputs runs-on: ubuntu-latest outputs: cpython-versions: ${{ steps.run.outputs.cpython-versions }} test-set: ${{ steps.run.outputs.test-set }} own-pip-versions: ${{ steps.run.outputs.own-pip-versions }} steps: # push and pull_request inputs aren't known to GitHub (pt3) - name: Set push defaults if: ${{ github.event_name == 'push' }} env: cpython-versions: all test-set: core run: | echo "cpython-versions=${{env.cpython-versions}}" >> "$GITHUB_ENV" echo "test-set=${{env.test-set}}" >> "$GITHUB_ENV" - name: Get pull_request inputs if: ${{ github.event_name == 'pull_request' }} env: cpython-versions: main test-set: both run: | echo "cpython-versions=${{env.cpython-versions}}" >> "$GITHUB_ENV" echo "test-set=${{env.test-set}}" >> "$GITHUB_ENV" - name: Make version array id: run run: | # Make a JSON Array from comma/space-separated string (no extra escaping) 
json_list() { \ ret=""; IFS="${IFS},"; set -- $*; \ for a in "$@"; do \ ret=$(printf '%s"%s"' "${ret}${ret:+, }" "$a"); \ done; \ printf '[%s]' "$ret"; } tests="${{ inputs.test-set || env.test-set }}" [ $tests = both ] && tests="core download" printf 'test-set=%s\n' "$(json_list $tests)" >> "$GITHUB_OUTPUT" versions="${{ inputs.cpython-versions || env.cpython-versions }}" if [ "$versions" = all ]; then \ versions="${{ env.all-cpython-versions }}"; else \ versions="${{ env.main-cpython-versions }}"; \ fi printf 'cpython-versions=%s\n' \ "$(json_list ${versions}${versions:+, }${{ env.pypy-versions }})" >> "$GITHUB_OUTPUT" # versions with a special get-pip.py in a per-version subdirectory printf 'own-pip-versions=%s\n' \ "$(json_list 2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6)" >> "$GITHUB_OUTPUT" tests: name: Run tests needs: select permissions: contents: read packages: write runs-on: ${{ matrix.os }} env: PIP: python -m pip PIP_DISABLE_PIP_VERSION_CHECK: true PIP_NO_PYTHON_VERSION_WARNING: true strategy: fail-fast: true matrix: os: [ubuntu-22.04] python-version: ${{ fromJSON(needs.select.outputs.cpython-versions) }} python-impl: [cpython] ytdl-test-set: ${{ fromJSON(needs.select.outputs.test-set) }} run-tests-ext: [sh] include: - os: windows-2022 python-version: 3.4 python-impl: cpython ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }} run-tests-ext: bat - os: windows-2022 python-version: 3.4 python-impl: cpython ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }} run-tests-ext: bat # jython - os: ubuntu-22.04 python-version: 2.7 python-impl: jython ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }} run-tests-ext: sh - os: ubuntu-22.04 python-version: 2.7 python-impl: jython ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }} run-tests-ext: sh steps: - name: Prepare Linux if: ${{ 
startswith(matrix.os, 'ubuntu') }} shell: bash run: | # apt in runner, if needed, may not be up-to-date sudo apt-get update - name: Checkout uses: actions/checkout@v3 #-------- Python 3 ----- - name: Set up supported Python ${{ matrix.python-version }} id: setup-python if: ${{ matrix.python-impl == 'cpython' && matrix.python-version != '2.6' && matrix.python-version != '2.7' && matrix.python-version != env.next }} # wrap broken actions/setup-python@v4 # NB may run apt-get install in Linux uses: ytdl-org/setup-python@v1 env: # Temporary (?) workaround for Python 3.5 failures - May 2024 PIP_TRUSTED_HOST: "pypi.python.org pypi.org files.pythonhosted.org" with: python-version: ${{ matrix.python-version }} cache-build: true allow-build: info - name: Locate supported Python ${{ matrix.python-version }} if: ${{ env.pythonLocation }} shell: bash run: | echo "PYTHONHOME=${pythonLocation}" >> "$GITHUB_ENV" export expected="${{ steps.setup-python.outputs.python-path }}" dirname() { printf '%s\n' \ 'import os, sys' \ 'print(os.path.dirname(sys.argv[1]))' \ | ${expected} - "$1"; } expd="$(dirname "$expected")" export python="$(command -v python)" [ "$expd" = "$(dirname "$python")" ] || echo "PATH=$expd:${PATH}" >> "$GITHUB_ENV" [ -x "$python" ] || printf '%s\n' \ 'import os' \ 'exp = os.environ["expected"]' \ 'python = os.environ["python"]' \ 'exps = os.path.split(exp)' \ 'if python and (os.path.dirname(python) == exp[0]):' \ ' exit(0)' \ 'exps[1] = "python" + os.path.splitext(exps[1])[1]' \ 'python = os.path.join(*exps)' \ 'try:' \ ' os.symlink(exp, python)' \ 'except AttributeError:' \ ' os.rename(exp, python)' \ | ${expected} - printf '%s\n' \ 'import sys' \ 'print(sys.path)' \ | ${expected} - #-------- Python next (was 3.12) - - name: Set up CPython 3.next environment if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == env.next }} shell: bash run: | PYENV_ROOT=$HOME/.local/share/pyenv echo "PYENV_ROOT=${PYENV_ROOT}" >> "$GITHUB_ENV" - name: Cache Python 
3.next id: cachenext if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == env.next }} uses: actions/cache@v3 with: key: python-${{ env.next }} path: | ${{ env.PYENV_ROOT }} - name: Build and set up Python 3.next if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == env.next && ! steps.cachenext.outputs.cache-hit }} # dl and build locally shell: bash run: | # Install build environment sudo apt-get install -y build-essential llvm libssl-dev tk-dev \ libncursesw5-dev libreadline-dev libsqlite3-dev \ libffi-dev xz-utils zlib1g-dev libbz2-dev liblzma-dev # Download PyEnv from its GitHub repository. export PYENV_ROOT=${{ env.PYENV_ROOT }} export PATH=$PYENV_ROOT/bin:$PATH git clone "https://github.com/pyenv/pyenv.git" "$PYENV_ROOT" pyenv install ${{ env.next }} - name: Locate Python 3.next if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == env.next }} shell: bash run: | PYTHONHOME="$(echo "${{ env.PYENV_ROOT }}/versions/${{ env.next }}."*)" test -n "$PYTHONHOME" echo "PYTHONHOME=$PYTHONHOME" >> "$GITHUB_ENV" echo "PATH=${PYTHONHOME}/bin:$PATH" >> "$GITHUB_ENV" #-------- Python 2.7 -- - name: Set up Python 2.7 if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.7' }} # install 2.7 shell: bash run: | # Ubuntu 22.04 no longer has python-is-python2: fetch it curl -L "http://launchpadlibrarian.net/474693132/python-is-python2_2.7.17-4_all.deb" -o python-is-python2.deb sudo apt-get install -y python2 sudo dpkg --force-breaks -i python-is-python2.deb echo "PYTHONHOME=/usr" >> "$GITHUB_ENV" #-------- Python 2.6 -- - name: Set up Python 2.6 environment if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.6' }} shell: bash run: | openssl_name=openssl-1.0.2u echo "openssl_name=${openssl_name}" >> "$GITHUB_ENV" openssl_dir=$HOME/.local/opt/$openssl_name echo "openssl_dir=${openssl_dir}" >> "$GITHUB_ENV" PYENV_ROOT=$HOME/.local/share/pyenv echo "PYENV_ROOT=${PYENV_ROOT}" >> "$GITHUB_ENV" sudo apt-get 
install -y openssl ca-certificates - name: Cache Python 2.6 id: cache26 if: ${{ matrix.python-version == '2.6' }} uses: actions/cache@v3 with: key: python-2.6.9 path: | ${{ env.openssl_dir }} ${{ env.PYENV_ROOT }} - name: Build and set up Python 2.6 if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.6' && ! steps.cache26.outputs.cache-hit }} # dl and build locally shell: bash run: | # Install build environment sudo apt-get install -y build-essential llvm libssl-dev tk-dev \ libncursesw5-dev libreadline-dev libsqlite3-dev \ libffi-dev xz-utils zlib1g-dev libbz2-dev liblzma-dev # Download and install OpenSSL 1.0.2, back in time openssl_name=${{ env.openssl_name }} openssl_targz=${openssl_name}.tar.gz openssl_dir=${{ env.openssl_dir }} openssl_inc=$openssl_dir/include openssl_lib=$openssl_dir/lib openssl_ssl=$openssl_dir/ssl curl -L "https://www.openssl.org/source/$openssl_targz" -o $openssl_targz tar -xf $openssl_targz ( cd $openssl_name; \ ./config --prefix=$openssl_dir --openssldir=${openssl_dir}/ssl \ --libdir=lib -Wl,-rpath=${openssl_dir}/lib shared zlib-dynamic && \ make && \ make install ) rm -rf $openssl_name rmdir $openssl_ssl/certs && ln -s /etc/ssl/certs $openssl_ssl/certs # Download PyEnv from its GitHub repository. 
export PYENV_ROOT=${{ env.PYENV_ROOT }} export PATH=$PYENV_ROOT/bin:$PATH git clone "https://github.com/pyenv/pyenv.git" "$PYENV_ROOT" # Prevent pyenv build trying (and failing) to update pip export GET_PIP=get-pip-2.6.py echo 'import sys; sys.exit(0)' > ${GET_PIP} GET_PIP=$(realpath $GET_PIP) # Build and install Python export CFLAGS="-I$openssl_inc" export LDFLAGS="-L$openssl_lib" export LD_LIBRARY_PATH="$openssl_lib" pyenv install 2.6.9 - name: Locate Python 2.6 if: ${{ matrix.python-impl == 'cpython' && matrix.python-version == '2.6' }} shell: bash run: | PYTHONHOME="${{ env.PYENV_ROOT }}/versions/2.6.9" echo "PYTHONHOME=$PYTHONHOME" >> "$GITHUB_ENV" echo "PATH=${PYTHONHOME}/bin:$PATH" >> "$GITHUB_ENV" echo "LD_LIBRARY_PATH=${{ env.openssl_dir }}/lib${LD_LIBRARY_PATH:+:}${LD_LIBRARY_PATH}" >> "$GITHUB_ENV" #-------- Jython ------ - name: Set up Java 8 if: ${{ matrix.python-impl == 'jython' }} uses: actions/setup-java@v3 with: java-version: 8 distribution: 'zulu' - name: Setup Jython environment if: ${{ matrix.python-impl == 'jython' }} shell: bash run: | echo "JYTHON_ROOT=${HOME}/jython" >> "$GITHUB_ENV" echo "PIP=pip" >> "$GITHUB_ENV" - name: Cache Jython id: cachejy if: ${{ matrix.python-impl == 'jython' && matrix.python-version == '2.7' }} uses: actions/cache@v3 with: # 2.7.3 now available, may solve SNI issue key: jython-2.7.1 path: | ${{ env.JYTHON_ROOT }} - name: Install Jython if: ${{ matrix.python-impl == 'jython' && matrix.python-version == '2.7' && ! 
steps.cachejy.outputs.cache-hit }} shell: bash run: | JYTHON_ROOT="${{ env.JYTHON_ROOT }}" curl -L "https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar" -o jython-installer.jar java -jar jython-installer.jar -s -d "${JYTHON_ROOT}" echo "${JYTHON_ROOT}/bin" >> "$GITHUB_PATH" - name: Set up cached Jython if: ${{ steps.cachejy.outputs.cache-hit }} shell: bash run: | JYTHON_ROOT="${{ env.JYTHON_ROOT }}" echo "${JYTHON_ROOT}/bin" >> $GITHUB_PATH - name: Install supporting Python 2.7 if possible if: ${{ steps.cachejy.outputs.cache-hit }} shell: bash run: | sudo apt-get install -y python2.7 || true #-------- pip --------- - name: Set up supported Python ${{ matrix.python-version }} pip if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || matrix.python-version == '2.7' }} # This step may run in either Linux or Windows shell: bash run: | echo "$PATH" echo "$PYTHONHOME" # curl is available on both Windows and Linux, -L follows redirects, -O gets name python -m ensurepip || python -m pip --version || { \ get_pip="${{ contains(needs.select.outputs.own-pip-versions, matrix.python-version) && format('{0}/', matrix.python-version) || '' }}"; \ curl -L -O "https://bootstrap.pypa.io/pip/${get_pip}get-pip.py"; \ python get-pip.py --no-setuptools --no-wheel; } - name: Set up Python 2.6 pip if: ${{ matrix.python-version == '2.6' }} shell: bash run: | python -m pip --version || { \ curl -L -O "https://bootstrap.pypa.io/pip/2.6/get-pip.py"; \ curl -L -O "https://files.pythonhosted.org/packages/ac/95/a05b56bb975efa78d3557efa36acaf9cf5d2fd0ee0062060493687432e03/pip-9.0.3-py2.py3-none-any.whl"; \ python get-pip.py --no-setuptools --no-wheel pip-9.0.3-py2.py3-none-any.whl; } # work-around to invoke pip module on 2.6: https://bugs.python.org/issue2751 echo "PIP=python -m pip.__main__" >> "$GITHUB_ENV" - name: Set up other Python ${{ matrix.python-version }} pip if: ${{ matrix.python-version == '3.2' && 
steps.setup-python.outputs.python-path }} shell: bash run: | python -m pip --version || { \ curl -L -O "https://bootstrap.pypa.io/pip/3.2/get-pip.py"; \ curl -L -O "https://files.pythonhosted.org/packages/b2/d0/cd115fe345dd6f07ec1c780020a7dfe74966fceeb171e0f20d1d4905b0b7/pip-7.1.2-py2.py3-none-any.whl"; \ python get-pip.py --no-setuptools --no-wheel pip-7.1.2-py2.py3-none-any.whl; } #-------- unittest ---- - name: Upgrade Unittest for Python 2.6 if: ${{ matrix.python-version == '2.6' }} shell: bash run: | # Work around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) $PIP -qq show unittest2 || { \ for u in "65/26/32b8464df2a97e6dd1b656ed26b2c194606c16fe163c695a992b36c11cdf/six-1.13.0-py2.py3-none-any.whl" \ "f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl" \ "c7/a3/c5da2a44c85bfbb6eebcfc1dde24933f8704441b98fdde6528f4831757a6/linecache2-1.0.0-py2.py3-none-any.whl" \ "17/0a/6ac05a3723017a967193456a2efa0aa9ac4b51456891af1e2353bb9de21e/traceback2-1.4.0-py2.py3-none-any.whl" \ "72/20/7f0f433060a962200b7272b8c12ba90ef5b903e218174301d0abfd523813/unittest2-1.1.0-py2.py3-none-any.whl"; do \ curl -L -O "https://files.pythonhosted.org/packages/${u}"; \ $PIP install ${u##*/}; \ done; } # make tests use unittest2 for test in ./test/test_*.py ./test/helper.py; do sed -r -i -e '/^import unittest$/s/test/test2 as unittest/' "$test" done #-------- nose -------- - name: Install nose for Python ${{ matrix.python-version }} if: ${{ (matrix.python-version != '3.2' && steps.setup-python.outputs.python-path) || (matrix.python-impl == 'cpython' && (matrix.python-version == '2.7' || matrix.python-version == env.next)) }} shell: bash run: | echo "$PATH" echo "$PYTHONHOME" # Use PyNose for recent Pythons instead of Nose py3ver="${{ matrix.python-version }}" py3ver=${py3ver#3.} [ "$py3ver" != "${{ matrix.python-version }}" ] && py3ver=${py3ver%.*} || py3ver=0 [ "$py3ver" -ge 9 ] 
&& nose=pynose || nose=nose $PIP -qq show $nose || $PIP install $nose - name: Install nose for other Python 2 if: ${{ matrix.python-impl == 'jython' || (matrix.python-impl == 'cpython' && matrix.python-version == '2.6') }} shell: bash run: | # Work around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) $PIP -qq show nose || { \ curl -L -O "https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl"; \ $PIP install nose-1.3.7-py2-none-any.whl; } - name: Install nose for other Python 3 if: ${{ matrix.python-version == '3.2' && steps.setup-python.outputs.python-path }} shell: bash run: | $PIP -qq show nose || { \ curl -L -O "https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl"; \ $PIP install nose-1.3.7-py3-none-any.whl; } - name: Set up nosetest test if: ${{ contains(needs.select.outputs.test-set, matrix.ytdl-test-set ) }} shell: bash run: | # set PYTHON_VER PYTHON_VER=${{ matrix.python-version }} [ "${PYTHON_VER#*-}" != "$PYTHON_VER" ] || PYTHON_VER="${{ matrix.python-impl }}-${PYTHON_VER}" echo "PYTHON_VER=$PYTHON_VER" >> "$GITHUB_ENV" echo "PYTHON_IMPL=${{ matrix.python-impl }}" >> "$GITHUB_ENV" # define a test to validate the Python version used by nosetests printf '%s\n' \ 'from __future__ import unicode_literals' \ 'import sys, os, platform' \ 'try:' \ ' import unittest2 as unittest' \ 'except ImportError:' \ ' import unittest' \ 'class TestPython(unittest.TestCase):' \ ' def setUp(self):' \ ' self.ver = os.environ["PYTHON_VER"].split("-")' \ ' def test_python_ver(self):' \ ' self.assertEqual(["%d" % v for v in sys.version_info[:2]], self.ver[-1].split(".")[:2])' \ ' self.assertTrue(sys.version.startswith(self.ver[-1]))' \ ' self.assertIn(self.ver[0], ",".join((sys.version, platform.python_implementation())).lower())' \ ' def 
test_python_impl(self):' \ ' self.assertIn(platform.python_implementation().lower(), (os.environ["PYTHON_IMPL"], self.ver[0]))' \ > test/test_python.py #-------- TESTS ------- - name: Run tests if: ${{ contains(needs.select.outputs.test-set, matrix.ytdl-test-set ) }} continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: YTDL_TEST_SET: ${{ matrix.ytdl-test-set }} run: | ./devscripts/run_tests.${{ matrix.run-tests-ext }} flake8: name: Linter runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v4 with: python-version: 3.9 - name: Install flake8 run: pip install flake8 - name: Run flake8 run: flake8 . ================================================ FILE: .gitignore ================================================ *.pyc *.pyo *.class *~ *.DS_Store wine-py2exe/ py2exe.log *.kate-swp build/ dist/ MANIFEST README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.fish youtube_dl/extractor/lazy_extractors.py youtube-dl youtube-dl.exe youtube-dl.tar.gz .coverage cover/ updates_key.pem *.egg-info *.srt *.ttml *.sbv *.vtt *.flv *.mp4 *.m4a *.m4v *.mp3 *.3gp *.wav *.ape *.mkv *.swf *.part *.ytdl *.swp test/local_parameters.json .tox youtube-dl.zsh # IntelliJ related files .idea *.iml tmp/ venv/ # VS Code related files .vscode ================================================ FILE: AUTHORS ================================================ Ricardo Garcia Gonzalez Danny Colligan Benjamin Johnson Vasyl' Vavrychuk Witold Baryluk Paweł Paprota Gergely Imreh Rogério Brito Philipp Hagemeister Sören Schulze Kevin Ngo Ori Avtalion shizeeg Filippo Valsorda Christian Albrecht Dave Vasilevsky Jaime Marquínez Ferrándiz Jeff Crouse Osama Khalid Michael Walter M. Yasoob Ullah Khalid Julien Fraichard Johny Mo Swag Axel Noack Albert Kim Pierre Rudloff Huarong Huo Ismael Mejía Steffan Donal Andras Elso Jelle van der Waa Marcin Cieślak Anton Larionov Takuya Tsuchida Sergey M. 
Michael Orlitzky Chris Gahan Saimadhav Heblikar Mike Col Oleg Prutz pulpe Andreas Schmitz Michael Kaiser Niklas Laxström David Triendl Anthony Weems David Wagner Juan C. Olivares Mattias Harrysson phaer Sainyam Kapoor Nicolas Évrard Jason Normore Hoje Lee Adam Thalhammer Georg Jähnig Ralf Haring Koki Takahashi Ariset Llerena Adam Malcontenti-Wilson Tobias Bell Naglis Jonaitis Charles Chen Hassaan Ali Dobrosław Żybort David Fabijan Sebastian Haas Alexander Kirk Erik Johnson Keith Beckman Ole Ernst Aaron McDaniel (mcd1992) Magnus Kolstad Hari Padmanaban Carlos Ramos 5moufl lenaten Dennis Scheiba Damon Timm winwon Xavier Beynon Gabriel Schubiner xantares Jan Matějka Mauroy Sébastien William Sewell Dao Hoang Son Oskar Jauch Matthew Rayfield t0mm0 Tithen-Firion Zack Fernandes cryptonaut Adrian Kretz Mathias Rav Petr Kutalek Will Glynn Max Reimann Cédric Luthi Thijs Vermeir Joel Leclerc Christopher Krooss Ondřej Caletka Dinesh S Johan K. Jensen Yen Chi Hsuan Enam Mijbah Noor David Luhmer Shaya Goldberg Paul Hartmann Frans de Jonge Robin de Rooij Ryan Schmidt Leslie P. Polzer Duncan Keall Alexander Mamay Devin J. Pohly Eduardo Ferro Aldama Jeff Buchbinder Amish Bhadeshia Joram Schrijver Will W. Mohammad Teimori Pabandi Roman Le Négrate Matthias Küch Julian Richen Ping O. Mister Hat Peter Ding jackyzy823 George Brighton Remita Amine Aurélio A. 
Heckert Bernhard Minks sceext Zach Bruggeman Tjark Saul slangangular Behrouz Abbasi ngld nyuszika7h Shaun Walbridge Lee Jenkins Anssi Hannula Lukáš Lalinský Qijiang Fan Rémy Léone Marco Ferragina reiv Muratcan Simsek Evan Lu flatgreen Brian Foley Vignesh Venkat Tom Gijselinck Founder Fang Andrew Alexeyew Saso Bezlaj Erwin de Haan Jens Wille Robin Houtevelts Patrick Griffis Aidan Rowe mutantmonkey Ben Congdon Kacper Michajłow José Joaquín Atria Viťas Strádal Kagami Hiiragi Philip Huppert blahgeek Kevin Deldycke inondle Tomáš Čech Déstin Reed Roman Tsiupa Artur Krysiak Jakub Adam Wieczorek Aleksandar Topuzović Nehal Patel Rob van Bekkum Petr Zvoníček Pratyush Singh Aleksander Nitecki Sebastian Blunt Matěj Cepl Xie Yanbo Philip Xu John Hawkinson Rich Leeper Zhong Jianxin Thor77 Mattias Wadman Arjan Verwer Costy Petrisor Logan B Alex Seiler Vijay Singh Paul Hartmann Stephen Chen Fabian Stahl Bagira Odd Stråbø Philip Herzog Thomas Christlieb Marek Rusinowski Tobias Gruetzmacher Olivier Bilodeau Lars Vierbergen Juanjo Benages Xiao Di Guan Thomas Winant Daniel Twardowski Jeremie Jarosh Gerard Rovira Marvin Ewald Frédéric Bournival Timendum gritstub Adam Voss Mike Fährmann Jan Kundrát Giuseppe Fabiano Örn Guðjónsson Parmjit Virk Genki Sky Ľuboš Katrinec Corey Nicholson Ashutosh Chaudhary John Dong Tatsuyuki Ishi Daniel Weber Kay Bouché Yang Hongbo Lei Wang Petr Novák Leonardo Taccari Martin Weinelt Surya Oktafendri TingPing Alexandre Macabies Bastian de Groot Niklas Haas András Veres-Szentkirályi Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin Adrian Heine ================================================ FILE: CONTRIBUTING.md ================================================ **Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. 
It should look similar to this: ``` $ youtube-dl -v [debug] System config: [] [debug] User config: [] [debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] youtube-dl version 2015.12.06 [debug] Git HEAD: 135392e [debug] Python version 2.6.6 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} ... ``` **Do not post screenshots of verbose logs; only plain text is acceptable.** The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): ### Is the description of the issue itself sufficient? We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts. So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious - What the problem is - How it could be fixed - How your proposed solution would look like If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. 
The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). **Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. ### Are you using the latest version? Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. ### Is the issue already documented? Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. ### Why are existing options not enough? Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#options). 
Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. ### Is there enough context in your bug report? People want to solve problems, and often think they do us a favor by breaking down their larger problems (e.g. wanting to skip already downloaded files) to a specific request (e.g. requesting us to look whether the file exists before downloading the info page). However, what often happens is that they break down the problem into two steps: One simple, and one impossible (or extremely complicated one). We are then presented with a very complicated request when the original problem could be solved far easier, e.g. by recording the downloaded video IDs in a separate file. To avoid this, you must include the greater context where it is non-obvious. In particular, every feature request that does not consist of adding support for a new site should contain a use case scenario that explains in what situation the missing feature would be useful. ### Does the issue involve one problem, and one problem only? Some of our users seem to think there is a limit of issues they can or should open. There is no limit of issues they can or should open. While it may seem appealing to be able to dump all your issues into one ticket, that means that someone who solves one of your issues cannot mark the issue as closed. Typically, reporting a bunch of issues leads to the ticket lingering since nobody wants to attack that behemoth, until someone mercifully splits the issue into multiple ones. In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, White house podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. 
As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. ### Is anyone going to need the feature? Only post features that you (or an incapacitated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them. ### Is your question about youtube-dl? It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different, or even the reporter's own, application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. # DEVELOPER INSTRUCTIONS Most users do not need to build youtube-dl and can [download the builds](https://ytdl-org.github.io/youtube-dl/download.html) or get them from their distribution. To run youtube-dl as a developer, you don't need to build anything either. Simply execute python -m youtube_dl To run the test, simply invoke your favorite test runner, or execute a test file directly; any of the following work: python -m unittest discover python test/test_download.py nosetests See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. 
If you want to create a build of youtube-dl yourself, you'll need * python * make (only GNU make is supported) * pandoc * zip * nosetests ### Adding support for a new site If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](README.md#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/ytdl-org/youtube-dl/fork) 2. Check out the source code with: git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git 3. Start a new git branch with cd youtube-dl git checkout -b yourextractor 4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: ```python # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' _TEST = { 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', 'ext': 'mp4', 'title': 'Video title goes here', 'thumbnail': r're:^https?://.*\.jpg$', # TODO more properties, either as: # * A value # * MD5 checksum; start the string with md5: # * A regular expression; start the string with re: # * Any Python type (for example int or float) } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... title = self._html_search_regex(r'

(.+?)

', webpage, 'title') return { 'id': video_id, 'title': title, 'description': self._og_search_description(webpage), 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), # TODO more properties (see youtube_dl/extractor/common.py) } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). This makes the extractor available for use, as long as the class ends with `IE`. 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 youtube_dl/extractor/yourextractor.py 9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 10. 
When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor 11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! ## youtube-dl coding conventions This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code. Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with a fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some non rolling release distros may never receive an update at all. ### Mandatory and optional metafields For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. 
Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) - `url` (media download URL) or `formats` In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. [Any field](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example Say you have some source dictionary `meta` that you've fetched as JSON with HTTP request and it has a key `summary`: ```python meta = self._download_json(url, video_id) ``` Assume at this point `meta`'s layout is: ```python { ... "summary": "some fancy summary text", ... } ``` Assume you want to extract `summary` and put it into the resulting info dict as `description`. 
Since `description` is an optional meta field, you should be prepared for this key to be missing from the `meta` dict, so you should extract it like:
Assume that you have some another source you can extract `title` from, for example `og:title` HTML meta of a `webpage`. In this case you can provide a fallback scenario: ```python title = meta.get('title') or self._og_search_title(webpage) ``` This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`. ### Regular expressions #### Don't capture groups you don't use Capturing group must be an indication that it's used somewhere in the code. Any group that is not used must be non capturing. ##### Example Don't capture id attribute name here since you can't use it for anything anyway. Correct: ```python r'(?:id|ID)=(?P\d+)' ``` Incorrect: ```python r'(id|ID)=(?P\d+)' ``` #### Make regular expressions relaxed and flexible When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on. ##### Example Say you need to extract `title` from the following HTML code: ```html some fancy title ``` The code for that task should look similar to: ```python title = self._search_regex( r']+class="title"[^>]*>([^<]+)', webpage, 'title') ``` Or even better: ```python title = self._search_regex( r']+class=(["\'])title\1[^>]*>(?P[^<]+)', webpage, 'title', group='title') ``` Note how you tolerate potential changes in the `style` attribute's value or switch from using double quotes to single for `class` attribute: The code definitely should not look like: ```python title = self._search_regex( r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>', webpage, 'title', group='title') ``` ### Long lines policy There is a soft limit to keep lines of code under 80 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. 
For example, you should **never** split long string literals like URLs or some other often copied entities over multiple lines to fit this limit: Correct: ```python 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' ``` Incorrect: ```python 'https://www.youtube.com/watch?v=FqZTN594JQw&list=' 'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' ``` ### Inline values Extracting variables is acceptable for reducing code duplication and improving readability of complex expressions. However, you should avoid extracting variables used only once and moving them to opposite parts of the extractor file, which makes reading the linear flow difficult. #### Example Correct: ```python title = self._html_search_regex(r'<title>([^<]+)', webpage, 'title') ``` Incorrect: ```python TITLE_RE = r'([^<]+)' # ...some lines of code... title = self._html_search_regex(TITLE_RE, webpage, 'title') ``` ### Collapse fallbacks Multiple fallback values can quickly become unwieldy. Collapse multiple fallback values into a single expression via a list of patterns. #### Example Good: ```python description = self._html_search_meta( ['og:description', 'description', 'twitter:description'], webpage, 'description', default=None) ``` Unwieldy: ```python description = ( self._og_search_description(webpage, default=None) or self._html_search_meta('description', webpage, default=None) or self._html_search_meta('twitter:description', webpage, default=None)) ``` Methods supporting list of patterns are: `_search_regex`, `_html_search_regex`, `_og_search_property`, `_html_search_meta`. ### Trailing parentheses Always move trailing parentheses after the last argument. 
#### Example Correct: ```python lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], list) ``` Incorrect: ```python lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], list, ) ``` ### Use convenience conversion and parsing functions Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. Use `url_or_none` for safe URL processing. Use `try_get` for safe metadata extraction from parsed JSON. Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. Explore [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. 
#### More examples ##### Safely extract optional description from parsed JSON ```python description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) ``` ##### Safely extract more optional metadata ```python video = try_get(response, lambda x: x['result']['video'][0], dict) or {} description = video.get('summary') duration = float_or_none(video.get('durationMs'), scale=1000) view_count = int_or_none(video.get('views')) ``` ================================================ FILE: ChangeLog ================================================ version 2021.12.17 Core * [postprocessor/ffmpeg] Show ffmpeg output on error (#22680, #29336) Extractors * [youtube] Update signature function patterns (#30363, #30366) * [peertube] Only call description endpoint if necessary (#29383) * [periscope] Pass referer to HLS requests (#29419) - [liveleak] Remove extractor (#17625, #24222, #29331) + [pornhub] Add support for pornhubthbh7ap3u.onion * [pornhub] Detect geo restriction * [pornhub] Dismiss tbr extracted from download URLs (#28927) * [curiositystream:collection] Extend _VALID_URL (#26326, #29117) * [youtube] Make get_video_info processing more robust (#29333) * [youtube] Workaround for get_video_info request (#29333) * [bilibili] Strip uploader name (#29202) * [youtube] Update invidious instance list (#29281) * [umg:de] Update GraphQL API URL (#29304) * [nrk] Switch psapi URL to https (#29344) + [egghead] Add support for app.egghead.io (#28404, #29303) * [appleconnect] Fix extraction (#29208) + [orf:tvthek] Add support for MPD formats (#28672, #29236) version 2021.06.06 Extractors * [facebook] Improve login required detection * [youporn] Fix formats and view count extraction (#29216) * [orf:tvthek] Fix thumbnails extraction (#29217) * [formula1] Fix extraction (#29206) * [ard] Relax URL regular expression and fix video ids (#22724, #29091) + [ustream] Detect https embeds (#29133) * [ted] Prefer own formats over external sources (#29142) * [twitch:clips] 
Improve extraction (#29149) + [twitch:clips] Add access token query to download URLs (#29136) * [youtube] Fix get_video_info request (#29086, #29165) * [vimeo] Fix vimeo pro embed extraction (#29126) * [redbulltv] Fix embed data extraction (#28770) * [shahid] Relax URL regular expression (#28772, #28930) version 2021.05.16 Core * [options] Fix thumbnail option group name (#29042) * [YoutubeDL] Improve extract_info doc (#28946) Extractors + [playstuff] Add support for play.stuff.co.nz (#28901, #28931) * [eroprofile] Fix extraction (#23200, #23626, #29008) + [vivo] Add support for vivo.st (#29009) + [generic] Add support for og:audio (#28311, #29015) * [phoenix] Fix extraction (#29057) + [generic] Add support for sibnet embeds + [vk] Add support for sibnet embeds (#9500) + [generic] Add Referer header for direct videojs download URLs (#2879, #20217, #29053) * [orf:radio] Switch download URLs to HTTPS (#29012, #29046) - [blinkx] Remove extractor (#28941) * [medaltv] Relax URL regular expression (#28884) + [funimation] Add support for optional lang code in URLs (#28950) + [gdcvault] Add support for HTML5 videos * [dispeak] Improve FLV extraction (#13513, #28970) * [kaltura] Improve iframe extraction (#28969) * [kaltura] Make embed code alternatives actually work * [cda] Improve extraction (#28709, #28937) * [twitter] Improve formats extraction from vmap URL (#28909) * [xtube] Fix formats extraction (#28870) * [svtplay] Improve extraction (#28507, #28876) * [tv2dk] Fix extraction (#28888) version 2021.04.26 Extractors + [xfileshare] Add support for wolfstream.tv (#28858) * [francetvinfo] Improve video id extraction (#28792) * [medaltv] Fix extraction (#28807) * [tver] Redirect all downloads to Brightcove (#28849) * [go] Improve video id extraction (#25207, #25216, #26058) * [youtube] Fix lazy extractors (#28780) + [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) * [cbsnews] Fix extraction for python <3.6 (#23359) version 2021.04.17 Core + [utils] 
Add support for experimental HTTP response status code 308 Permanent Redirect (#27877, #28768) Extractors + [lbry] Add support for HLS videos (#27877, #28768) * [youtube] Fix stretched ratio calculation * [youtube] Improve stretch extraction (#28769) * [youtube:tab] Improve grid extraction (#28725) + [youtube:tab] Detect series playlist on playlists page (#28723) + [youtube] Add more invidious instances (#28706) * [pluralsight] Extend anti-throttling timeout (#28712) * [youtube] Improve URL to extractor routing (#27572, #28335, #28742) + [maoritv] Add support for maoritelevision.com (#24552) + [youtube:tab] Pass innertube context and x-goog-visitor-id header along with continuation requests (#28702) * [mtv] Fix Viacom A/B Testing Video Player extraction (#28703) + [pornhub] Extract DASH and HLS formats from get_media end point (#28698) * [cbssports] Fix extraction (#28682) * [jamendo] Fix track extraction (#28686) * [curiositystream] Fix format extraction (#26845, #28668) version 2021.04.07 Core * [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies + [compat] Introduce compat_cookies_SimpleCookie * [extractor/common] Improve JSON-LD author extraction * [extractor/common] Fix _get_cookies on python 2 (#20673, #23256, #20326, #28640) Extractors * [youtube] Fix extraction of videos with restricted location (#28685) + [line] Add support for live.line.me (#17205, #28658) * [vimeo] Improve extraction (#28591) * [youku] Update ccode (#17852, #28447, #28460, #28648) * [youtube] Prefer direct entry metadata over entry metadata from playlist (#28619, #28636) * [screencastomatic] Fix extraction (#11976, #24489) + [palcomp3] Add support for palcomp3.com (#13120) + [arnes] Add support for video.arnes.si (#28483) + [youtube:tab] Add support for hashtags (#28308) version 2021.04.01 Extractors * [youtube] Setup CONSENT cookie when needed (#28604) * [vimeo] Fix password protected review extraction (#27591) * [youtube] Improve age-restricted video extraction (#28578) 
version 2021.03.31 Extractors * [vlive] Fix inkey request (#28589) * [francetvinfo] Improve video id extraction (#28584) + [instagram] Extract duration (#28469) * [instagram] Improve title extraction (#28469) + [sbs] Add support for ondemand watch URLs (#28566) * [youtube] Fix video's channel extraction (#28562) * [picarto] Fix live stream extraction (#28532) * [vimeo] Fix unlisted video extraction (#28414) * [youtube:tab] Fix playlist/community continuation items extraction (#28266) * [ard] Improve clip id extraction (#22724, #28528) version 2021.03.25 Extractors + [zoom] Add support for zoom.us (#16597, #27002, #28531) * [bbc] Fix BBC IPlayer Episodes/Group extraction (#28360) * [youtube] Fix default value for youtube_include_dash_manifest (#28523) * [zingmp3] Fix extraction (#11589, #16409, #16968, #27205) + [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) + [tiktok] Detect private videos (#28453) * [vimeo:album] Fix extraction for albums with number of videos multiple to page size (#28486) * [vvvvid] Fix kenc format extraction (#28473) * [mlb] Fix video extraction (#21241) * [svtplay] Improve extraction (#28448) * [applepodcasts] Fix extraction (#28445) * [rtve] Improve extraction + Extract all formats * Fix RTVE Infantil extraction (#24851) + Extract is_live and series version 2021.03.14 Core + Introduce release_timestamp meta field (#28386) Extractors + [southpark] Add support for southparkstudios.com (#28413) * [southpark] Fix extraction (#26763, #28413) * [sportdeutschland] Fix extraction (#21856, #28425) * [pinterest] Reduce the number of HLS format requests * [peertube] Improve thumbnail extraction (#28419) * [tver] Improve title extraction (#28418) * [fujitv] Fix HLS formats extension (#28416) * [shahid] Fix format extraction (#28383) + [lbry] Add support for channel filters (#28385) + [bandcamp] Extract release timestamp + [lbry] Extract release timestamp (#28386) * [pornhub] Detect flagged videos + [pornhub] Extract formats from 
get_media end point (#28395) * [bilibili] Fix video info extraction (#28341) + [cbs] Add support for Paramount+ (#28342) + [trovo] Add Origin header to VOD formats (#28346) * [voxmedia] Fix volume embed extraction (#28338) version 2021.03.03 Extractors * [youtube:tab] Switch continuation to browse API (#28289, #28327) * [9c9media] Fix extraction for videos with multiple ContentPackages (#28309) + [bbc] Add support for BBC Reel videos (#21870, #23660, #28268) version 2021.03.02 Extractors * [zdf] Rework extractors (#11606, #13473, #17354, #21185, #26711, #27068, #27930, #28198, #28199, #28274) * Generalize cross-extractor video ids for zdf based extractors * Improve extraction * Fix 3sat and phoenix * [stretchinternet] Fix extraction (#28297) * [urplay] Fix episode data extraction (#28292) + [bandaichannel] Add support for b-ch.com (#21404) * [srgssr] Improve extraction (#14717, #14725, #27231, #28238) + Extract subtitle * Fix extraction for new videos * Update srf download domains * [vvvvid] Reduce season request payload size + [vvvvid] Extract series sublists playlist title (#27601, #27618) + [dplay] Extract Ad-Free uplynk URLs (#28160) + [wat] Detect DRM protected videos (#27958) * [tf1] Improve extraction (#27980, #28040) * [tmz] Fix and improve extraction (#24603, #24687, 28211) + [gedidigital] Add support for Gedi group sites (#7347, #26946) * [youtube] Fix get_video_info request version 2021.02.22 Core + [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase (#28112) Extractors * [apa] Fix and improve extraction (#27750) + [youporn] Extract duration (#28019) + [peertube] Add support for canard.tube (#28190) * [youtube] Fixup m4a_dash formats (#28165) + [samplefocus] Add support for samplefocus.com (#27763) + [vimeo] Add support for unlisted video source format extraction * [viki] Improve extraction (#26522, #28203) * Extract uploader URL and episode number * Report login required error + Extract 480p formats * Fix API v4 calls * [ninegag] 
Unescape title (#28201) * [youtube] Improve URL regular expression (#28193) + [youtube] Add support for redirect.invidious.io (#28193) + [dplay] Add support for de.hgtv.com (#28182) + [dplay] Add support for discoveryplus.com (#24698) + [simplecast] Add support for simplecast.com (#24107) * [youtube] Fix uploader extraction in flat playlist mode (#28045) * [yandexmusic:playlist] Request missing tracks in chunks (#27355, #28184) + [storyfire] Add support for storyfire.com (#25628, #26349) + [zhihu] Add support for zhihu.com (#28177) * [youtube] Fix controversial videos when authenticated with cookies (#28174) * [ccma] Fix timestamp parsing in python 2 + [videopress] Add support for video.wordpress.com * [kakao] Improve info extraction and detect geo restriction (#26577) * [xboxclips] Fix extraction (#27151) * [ard] Improve formats extraction (#28155) + [canvas] Add support for dagelijksekost.een.be (#28119) version 2021.02.10 Extractors * [youtube:tab] Improve grid continuation extraction (#28130) * [ign] Fix extraction (#24771) + [xhamster] Extract format filesize + [xhamster] Extract formats from xplayer settings (#28114) + [youtube] Add support phone/tablet JS player (#26424) * [archiveorg] Fix and improve extraction (#21330, #23586, #25277, #26780, #27109, #27236, #28063) + [cda] Detect geo restricted videos (#28106) * [urplay] Fix extraction (#28073, #28074) * [youtube] Fix release date extraction (#28094) + [youtube] Extract abr and vbr (#28100) * [youtube] Skip OTF formats (#28070) version 2021.02.04.1 Extractors * [youtube] Prefer DASH formats (#28070) * [azmedien] Fix extraction (#28064) version 2021.02.04 Extractors * [pornhub] Implement lazy playlist extraction * [svtplay] Fix video id extraction (#28058) + [pornhub] Add support for authentication (#18797, #21416, #24294) * [pornhub:user] Improve paging + [pornhub:user] Add support for URLs unavailable via /videos page (#27853) + [bravotv] Add support for oxygen.com (#13357, #22500) + [youtube] Pass embed 
URL to get_video_info request * [ccma] Improve metadata extraction (#27994) + Extract age limit, alt title, categories, series and episode number * Fix timestamp multiple subtitles extraction * [egghead] Update API domain (#28038) - [vidzi] Remove extractor (#12629) * [vidio] Improve metadata extraction * [youtube] Improve subtitles extraction * [youtube] Fix chapter extraction fallback * [youtube] Rewrite extractor * Improve format sorting * Remove unused code * Fix series metadata extraction * Fix trailer video extraction * Improve error reporting + Extract video location + [vvvvid] Add support for youtube embeds (#27825) * [googledrive] Report download page errors (#28005) * [vlive] Fix error message decoding for python 2 (#28004) * [youtube] Improve DASH formats file size extraction * [cda] Improve birth validation detection (#14022, #27929) + [awaan] Extract uploader id (#27963) + [medialaan] Add support DPG Media MyChannels based websites (#14871, #15597, #16106, #16489) * [abcnews] Fix extraction (#12394, #27920) * [AMP] Fix upload date and timestamp extraction (#27970) * [tv4] Relax URL regular expression (#27964) + [tv2] Add support for mtvuutiset.fi (#27744) * [adn] Improve login warning reporting * [zype] Fix uplynk id extraction (#27956) + [adn] Add support for authentication (#17091, #27841, #27937) version 2021.01.24.1 Core * Introduce --output-na-placeholder (#27896) Extractors * [franceculture] Make thumbnail optional (#18807) * [franceculture] Fix extraction (#27891, #27903) * [njpwworld] Fix extraction (#27890) * [comedycentral] Fix extraction (#27905) * [wat] Fix format extraction (#27901) + [americastestkitchen:season] Add support for seasons (#27861) + [trovo] Add support for trovo.live (#26125) + [aol] Add support for yahoo videos (#26650) * [yahoo] Fix single video extraction * [lbry] Unescape lbry URI (#27872) * [9gag] Fix and improve extraction (#23022) * [americastestkitchen] Improve metadata extraction for ATK episodes (#27860) * 
[aljazeera] Fix extraction (#20911, #27779) + [minds] Add support for minds.com (#17934) * [ard] Fix title and description extraction (#27761) + [spotify] Add support for Spotify Podcasts (#27443) version 2021.01.16 Core * [YoutubeDL] Protect from infinite recursion due to recursively nested playlists (#27833) * [YoutubeDL] Ignore failure to create existing directory (#27811) * [YoutubeDL] Raise syntax error for format selection expressions with multiple + operators (#27803) Extractors + [animeondemand] Add support for lazy playlist extraction (#27829) * [youporn] Restrict fallback download URL (#27822) * [youporn] Improve height and tbr extraction (#20425, #23659) * [youporn] Fix extraction (#27822) + [twitter] Add support for unified cards (#27826) + [twitch] Add Authorization header with OAuth token for GraphQL requests (#27790) * [mixcloud:playlist:base] Extract video id in flat playlist mode (#27787) * [cspan] Improve info extraction (#27791) * [adn] Improve info extraction * [adn] Fix extraction (#26963, #27732) * [youtube:search] Extract from all sections (#27604) * [youtube:search] fix viewcount and try to extract all video sections (#27604) * [twitch] Improve login error extraction * [twitch] Fix authentication (#27743) * [3qsdn] Improve extraction (#21058) * [peertube] Extract formats from streamingPlaylists (#26002, #27586, #27728) * [khanacademy] Fix extraction (#2887, #26803) * [spike] Update Paramount Network feed URL (#27715) version 2021.01.08 Core * [downloader/hls] Disable decryption in tests (#27660) + [utils] Add a function to clean podcast URLs Extractors * [rai] Improve subtitles extraction (#27698, #27705) * [canvas] Match only supported VRT NU URLs (#27707) + [bibeltv] Add support for bibeltv.de (#14361) + [bfmtv] Add support for bfmtv.com (#16053, #26615) + [sbs] Add support for ondemand play and news embed URLs (#17650, #27629) * [twitch] Drop legacy kraken API v5 code altogether and refactor * [twitch:vod] Switch to GraphQL for video 
metadata * [canvas] Fix VRT NU extraction (#26957, #27053) * [twitch] Switch access token to GraphQL and refactor (#27646) + [rai] Detect ContentItem in iframe (#12652, #27673) * [ketnet] Fix extraction (#27662) + [dplay] Add suport Discovery+ domains (#27680) * [motherless] Improve extraction (#26495, #27450) * [motherless] Fix recent videos upload date extraction (#27661) * [nrk] Fix extraction for videos without a legalAge rating - [googleplus] Remove extractor (#4955, #7400) + [applepodcasts] Add support for podcasts.apple.com (#25918) + [googlepodcasts] Add support for podcasts.google.com + [iheart] Add support for iheart.com (#27037) * [acast] Clean podcast URLs * [stitcher] Clean podcast URLs + [xfileshare] Add support for aparat.cam (#27651) + [twitter] Add support for summary card (#25121) * [twitter] Try to use a Generic fallback for unknown twitter cards (#25982) + [stitcher] Add support for shows and show metadata extraction (#20510) * [stv] Improve episode id extraction (#23083) version 2021.01.03 Extractors * [nrk] Improve series metadata extraction (#27473) + [nrk] Extract subtitles * [nrk] Fix age limit extraction * [nrk] Improve video id extraction + [nrk] Add support for podcasts (#27634, #27635) * [nrk] Generalize and delegate all item extractors to nrk + [nrk] Add support for mp3 formats * [nrktv] Switch to playback endpoint * [vvvvid] Fix season metadata extraction (#18130) * [stitcher] Fix extraction (#20811, #27606) * [acast] Fix extraction (#21444, #27612, #27613) + [arcpublishing] Add support for arcpublishing.com (#2298, #9340, #17200) + [sky] Add support for Sports News articles and Brighcove videos (#13054) + [vvvvid] Extract akamai formats * [vvvvid] Skip unplayable episodes (#27599) * [yandexvideo] Fix extraction for Python 3.4 version 2020.12.31 Core * [utils] Accept only supported protocols in url_or_none * [YoutubeDL] Allow format filtering using audio language (#16209) Extractors + [redditr] Extract all thumbnails (#27503) * 
[vvvvid] Improve info extraction + [vvvvid] Add support for playlists (#18130, #27574) + [yandexdisk] Extract info from webpage * [yandexdisk] Fix extraction (#17861, #27131) * [yandexvideo] Use old API call as fallback * [yandexvideo] Fix extraction (#25000) - [nbc] Remove CSNNE extractor * [nbc] Fix NBCSport VPlayer URL extraction (#16640) + [aenetworks] Add support for biography.com (#3863) * [uktvplay] Match new video URLs (#17909) * [sevenplay] Detect API errors * [tenplay] Fix format extraction (#26653) * [brightcove] Raise error for DRM protected videos (#23467, #27568) version 2020.12.29 Extractors * [youtube] Improve yt initial data extraction (#27524) * [youtube:tab] Improve URL matching #27559) * [youtube:tab] Restore retry on browse requests (#27313, #27564) * [aparat] Fix extraction (#22285, #22611, #23348, #24354, #24591, #24904, #25418, #26070, #26350, #26738, #27563) - [brightcove] Remove sonyliv specific code * [piksel] Improve format extraction + [zype] Add support for uplynk videos + [toggle] Add support for live.mewatch.sg (#27555) + [go] Add support for fxnow.fxnetworks.com (#13972, #22467, #23754, #26826) * [teachable] Improve embed detection (#26923) * [mitele] Fix free video extraction (#24624, #25827, #26757) * [telecinco] Fix extraction * [youtube] Update invidious.snopyta.org (#22667) * [amcnetworks] Improve auth only video detection (#27548) + [generic] Add support for VHX Embeds (#27546) version 2020.12.26 Extractors * [instagram] Fix comment count extraction + [instagram] Add support for reel URLs (#26234, #26250) * [bbc] Switch to media selector v6 (#23232, #23933, #26303, #26432, #26821, #27538) * [instagram] Improve thumbnail extraction * [instagram] Fix extraction when authenticated (#22880, #26377, #26981, #27422) * [spankbang:playlist] Fix extraction (#24087) + [spankbang] Add support for playlist videos * [pornhub] Improve like and dislike count extraction (#27356) * [pornhub] Fix lq formats extraction (#27386, #27393) + 
[bongacams] Add support for bongacams.com (#27440) * [youtube:tab] Extend URL regular expression (#27501) * [theweatherchannel] Fix extraction (#25930, #26051) + [sprout] Add support for Universal Kids (#22518) * [theplatform] Allow passing geo bypass countries from other extractors + [wistia] Add support for playlists (#27533) + [ctv] Add support for ctv.ca (#27525) * [9c9media] Improve info extraction * [youtube] Fix automatic captions extraction (#27162, #27388) * [sonyliv] Fix title for movies * [sonyliv] Fix extraction (#25667) * [streetvoice] Fix extraction (#27455, #27492) + [facebook] Add support for watchparty pages (#27507) * [cbslocal] Fix video extraction + [brightcove] Add another method to extract policyKey * [mewatch] Relax URL regular expression (#27506) version 2020.12.22 Core * [common] Remove unwanted query params from unsigned akamai manifest URLs Extractors - [tastytrade] Remove extractor (#25716) * [niconico] Fix playlist extraction (#27428) - [everyonesmixtape] Remove extractor - [kanalplay] Remove extractor * [arkena] Fix extraction * [nba] Rewrite extractor * [turner] Improve info extraction * [youtube] Improve xsrf token extraction (#27442) * [generic] Improve RSS age limit extraction * [generic] Fix RSS itunes thumbnail extraction (#27405) + [redditr] Extract duration (#27426) - [zaq1] Remove extractor + [asiancrush] Add support for retrocrush.tv * [asiancrush] Fix extraction - [noco] Remove extractor (#10864) * [nfl] Fix extraction (#22245) * [skysports] Relax URL regular expression (#27435) + [tv5unis] Add support for tv5unis.ca (#22399, #24890) + [videomore] Add support for more.tv (#27088) + [yandexmusic] Add support for music.yandex.com (#27425) + [nhk:program] Add support for audio programs and program clips + [nhk] Add support for NHK video programs (#27230) version 2020.12.14 Core * [extractor/common] Improve JSON-LD interaction statistic extraction (#23306) * [downloader/hls] Delegate manifests with media initialization to ffmpeg 
+ [extractor/common] Document duration meta field for playlists Extractors * [mdr] Bypass geo restriction * [mdr] Improve extraction (#24346, #26873) * [yandexmusic:album] Improve album title extraction (#27418) * [eporner] Fix view count extraction and make optional (#23306) + [eporner] Extend URL regular expression * [eporner] Fix hash extraction and extend _VALID_URL (#27396) * [slideslive] Use m3u8 entry protocol for m3u8 formats (#27400) * [twitcasting] Fix format extraction and improve info extraction (#24868) * [linuxacademy] Fix authentication and extraction (#21129, #26223, #27402) * [itv] Clean description from HTML tags (#27399) * [vlive] Sort live formats (#27404) * [hotstart] Fix and improve extraction * Fix format extraction (#26690) + Extract thumbnail URL (#16079, #20412) + Add support for country specific playlist URLs (#23496) * Select the last id in video URL (#26412) + [youtube] Add some invidious instances (#27373) version 2020.12.12 Core * [YoutubeDL] Improve thumbnail filename deducing (#26010, #27244) Extractors + [ruutu] Extract more metadata + [ruutu] Detect non-free videos (#21154) * [ruutu] Authenticate format URLs (#21031, #26782) + [ruutu] Add support for static.nelonenmedia.fi (#25412) + [ruutu] Extend URL regular expression (#24839) + [facebook] Add support archived live video URLs (#15859) * [wdr] Improve overall extraction + [wdr] Extend subtitles extraction (#22672, #22723) + [facebook] Add support for videos attached to Relay based story pages (#10795) + [wdr:page] Add support for kinder.wdr.de (#27350) + [facebook] Add another regular expression for handleServerJS * [facebook] Fix embed page extraction + [facebook] Add support for Relay post pages (#26935) + [facebook] Add support for watch videos (#22795, #27062) + [facebook] Add support for group posts with multiple videos (#19131) * [itv] Fix series metadata extraction (#26897) - [itv] Remove old extraction method (#23177) * [facebook] Redirect mobile URLs to desktop URLs 
(#24831, #25624) + [facebook] Add support for Relay based pages (#26823) * [facebook] Try to reduce unnecessary tahoe requests - [facebook] Remove hardcoded Chrome User-Agent (#18974, #25411, #26958, #27329) - [smotri] Remove extractor (#27358) - [beampro] Remove extractor (#17290, #22871, #23020, #23061, #26099) version 2020.12.09 Core * [extractor/common] Fix inline HTML5 media tags processing (#27345) Extractors * [youtube:tab] Improve identity token extraction (#27197) * [youtube:tab] Make click tracking params on continuation optional * [youtube:tab] Delegate inline playlists to tab-based playlists (27298) + [tubitv] Extract release year (#27317) * [amcnetworks] Fix free content extraction (#20354) + [lbry:channel] Add support for channels (#25584) + [lbry] Add support for short and embed URLs * [lbry] Fix channel metadata extraction + [telequebec] Add support for video.telequebec.tv (#27339) * [telequebec] Fix extraction (#25733, #26883) + [youtube:tab] Capture and output alerts (#27340) * [tvplay:home] Fix extraction (#21153) * [americastestkitchen] Fix Extraction and add support for Cook's Country and Cook's Illustrated (#17234, #27322) + [slideslive] Add support for yoda service videos and extract subtitles (#27323) version 2020.12.07 Core * [extractor/common] Extract timestamp from Last-Modified header + [extractor/common] Add support for dl8-* media tags (#27283) * [extractor/common] Fix media type extraction for HTML5 media tags in start/end form Extractors * [aenetworks] Fix extraction (#23363, #23390, #26795, #26985) * Fix Fastly format extraction + Add support for play and watch subdomains + Extract series metadata * [youtube] Improve youtu.be extraction in non-existing playlists (#27324) + [generic] Extract RSS video description, timestamp and itunes metadata (#27177) * [nrk] Reduce the number of instalments and episodes requests * [nrk] Improve extraction * Improve format extraction for old akamai formats + Add is_live value to entry info dict * 
Request instalments only when available * Fix skole extraction + [peertube] Extract fps + [peertube] Recognize audio-only formats (#27295) version 2020.12.05 Core * [extractor/common] Improve Akamai HTTP format extraction * Allow m3u8 manifest without an additional audio format * Fix extraction for qualities starting with a number Extractors * [teachable:course] Improve extraction (#24507, #27286) * [nrk] Improve error extraction * [nrktv:series] Improve extraction (#21926) * [nrktv:season] Improve extraction * [nrk] Improve format extraction and geo-restriction detection (#24221) * [pornhub] Handle HTTP errors gracefully (#26414) * [nrktv] Relax URL regular expression (#27299, #26185) + [zdf] Extract webm formats (#26659) + [gamespot] Extract DASH and HTTP formats + [tver] Add support for tver.jp (#26662, #27284) + [pornhub] Add support for pornhub.org (#27276) version 2020.12.02 Extractors + [tva] Add support for qub.ca (#27235) + [toggle] Detect DRM protected videos (#16479, #20805) + [toggle] Add support for new MeWatch URLs (#27256) * [youtube:tab] Extract channels only from channels tab (#27266) + [cspan] Extract info from jwplayer data (#3672, #3734, #10638, #13030, #18806, #23148, #24461, #26171, #26800, #27263) * [cspan] Pass Referer header with format's video URL (#26032, #25729) * [youtube] Improve age-gated videos extraction (#27259) + [mediaset] Add support for movie URLs (#27240) * [yandexmusic] Refactor + [yandexmusic] Add support for artist's tracks and albums (#11887, #22284) * [yandexmusic:track] Fix extraction (#26449, #26669, #26747, #26748, #26762) version 2020.11.29 Core * [YoutubeDL] Write static debug to stderr and respect quiet for dynamic debug (#14579, #22593) Extractors * [drtv] Extend URL regular expression (#27243) * [tiktok] Fix extraction (#20809, #22838, #22850, #25987, #26281, #26411, #26639, #26776, #27237) + [ina] Add support for mobile URLs (#27229) * [pornhub] Fix like and dislike count extraction (#27227, #27234) * [youtube] 
Improve yt initial player response extraction (#27216) * [videa] Fix extraction (#25650, #25973, #26301) version 2020.11.26 Core * [downloader/fragment] Set final file's mtime according to last fragment's Last-Modified header (#11718, #18384, #27138) Extractors + [spreaker] Add support for spreaker.com (#13480, #13877) * [vlive] Improve extraction for geo-restricted videos + [vlive] Add support for post URLs (#27122, #27123) * [viki] Fix video API request (#27184) * [bbc] Fix BBC Three clip extraction * [bbc] Fix BBC News videos extraction + [medaltv] Add support for medal.tv (#27149) * [youtube] Improve music metadata and license extraction (#26013) * [nrk] Fix extraction * [cda] Fix extraction (#17803, #24458, #24518, #26381) version 2020.11.24 Core + [extractor/common] Add generic support for akamai HTTP format extraction Extractors * [youtube:tab] Fix feeds extraction (#25695, #26452) * [youtube:favorites] Restore extractor * [youtube:tab] Fix some weird typo (#27157) + [pinterest] Add support for large collections (more than 25 pins) + [franceinter] Extract thumbnail (#27153) + [box] Add support for box.com (#5949) + [nytimes] Add support for cooking.nytimes.com (#27112, #27143) * [lbry] Relax URL regular expression (#27144) + [rumble] Add support for embed pages (#10785) + [skyit] Add support for multiple Sky Italia websites (#26629) + [pinterest] Add support for pinterest.com (#25747) version 2020.11.21.1 Core * [downloader/http] Fix crash during urlopen caused by missing reason of URLError * [YoutubeDL] Fix --ignore-errors for playlists with generator-based entries of url_transparent (#27064) Extractors + [svtplay] Add support for svt.se/barnkanalen (#24817) + [svt] Extract timestamp (#27130) * [svtplay] Improve thumbnail extraction (#27130) * [youtube] Fix error reason extraction (#27081) * [youtube] Fix like and dislike count extraction (#25977) + [youtube:tab] Add support for current video and fix lives extraction (#27126) * [infoq] Fix format extraction 
(#25984) * [francetv] Update to fix thumbnail URL issue (#27120) * [youtube] Improve yt initial data extraction (#27093) + [discoverynetworks] Add support new TLC/DMAX URLs (#27100) * [rai] Fix protocol relative relinker URLs (#22766) * [rai] Fix unavailable video format detection * [rai] Improve extraction * [rai] Fix extraction (#27077) * [viki] Improve format extraction * [viki] Fix stream extraction from MPD (#27092) * [googledrive] Fix format extraction (#26979) + [amara] Add support for amara.org (#20618) * [vimeo:album] Fix extraction (#27079) * [mtv] Fix mgid extraction (#26841) version 2020.11.19 Core * [extractor/common] Output error for invalid URLs in _is_valid_url (#21400, #24151, #25617, #25618, #25586, #26068, #27072) Extractors * [youporn] Fix upload date extraction * [youporn] Make comment count optional (#26986) * [arte] Rework extractors * Reimplement embed and playlist extractors to delegate to the single entrypoint artetv extractor * Improve embeds detection (#27057) + [arte] Extract m3u8 formats (#27061) * [mgtv] Fix format extraction (#26415) + [lbry] Add support for odysee.com (#26806) * [francetv] Improve info extraction + [francetv] Add fallback video URL extraction (#27047) version 2020.11.18 Extractors * [spiegel] Fix extraction (#24206, #24767) * [youtube] Improve extraction + Add support for --no-playlist (#27009) * Improve playlist and mix extraction (#26390, #26509, #26534, #27011) + Extract playlist uploader data * [youtube:tab] Fix view count extraction (#27051) * [malltv] Fix extraction (#27035) + [bandcamp] Extract playlist description (#22684) * [urplay] Fix extraction (#26828) * [youtube:tab] Fix playlist title extraction (#27015) * [youtube] Fix chapters extraction (#26005) version 2020.11.17 Core * [utils] Skip ! 
prefixed code in js_to_json Extractors * [youtube:tab] Fix extraction with cookies provided (#27005) * [lrt] Fix extraction with empty tags (#20264) + [ndr:embed:base] Extract subtitles (#25447, #26106) + [servus] Add support for pm-wissen.com (#25869) * [servus] Fix extraction (#26872, #26967, #26983, #27000) * [xtube] Fix extraction (#26996) * [lrt] Fix extraction + [lbry] Add support for lbry.tv + [condenast] Extract subtitles * [condenast] Fix extraction * [bandcamp] Fix extraction (#26681, #26684) * [rai] Fix RaiPlay extraction (#26064, #26096) * [vlive] Fix extraction * [usanetwork] Fix extraction * [nbc] Fix NBCNews/Today/MSNBC extraction * [cnbc] Fix extraction version 2020.11.12 Extractors * [youtube] Rework extractors version 2020.11.01 Core * [utils] Don't attempt to coerce JS strings to numbers in js_to_json (#26851) * [downloader/http] Properly handle missing message in SSLError (#26646) * [downloader/http] Fix access to not yet opened stream in retry Extractors * [youtube] Fix JS player URL extraction * [ytsearch] Fix extraction (#26920) * [afreecatv] Fix typo (#26970) * [23video] Relax URL regular expression (#26870) + [ustream] Add support for video.ibm.com (#26894) * [iqiyi] Fix typo (#26884) + [expressen] Add support for di.se (#26670) * [iprima] Improve video id extraction (#26507, #26494) version 2020.09.20 Core * [extractor/common] Relax interaction count extraction in _json_ld + [extractor/common] Extract author as uploader for VideoObject in _json_ld * [downloader/hls] Fix incorrect end byte in Range HTTP header for media segments with EXT-X-BYTERANGE (#14748, #24512) * [extractor/common] Handle ssl.CertificateError in _request_webpage (#26601) * [downloader/http] Improve timeout detection when reading block of data (#10935) * [downloader/http] Retry download when urlopen times out (#10935, #26603) Extractors * [redtube] Extend URL regular expression (#26506) * [twitch] Refactor * [twitch:stream] Switch to GraphQL and fix reruns (#26535) + 
[telequebec] Add support for brightcove videos (#25833) * [pornhub] Extract metadata from JSON-LD (#26614) * [pornhub] Fix view count extraction (#26621, #26614) version 2020.09.14 Core + [postprocessor/embedthumbnail] Add support for non jpg/png thumbnails (#25687, #25717) Extractors * [rtlnl] Extend URL regular expression (#26549, #25821) * [youtube] Fix empty description extraction (#26575, #26006) * [srgssr] Extend URL regular expression (#26555, #26556, #26578) * [googledrive] Use redirect URLs for source format (#18877, #23919, #24689, #26565) * [svtplay] Fix id extraction (#26576) * [redbulltv] Improve support for rebull.com TV localized URLs (#22063) + [redbulltv] Add support for new redbull.com TV URLs (#22037, #22063) * [soundcloud:pagedplaylist] Reduce pagination limit (#26557) version 2020.09.06 Core + [utils] Recognize wav mimetype (#26463) Extractors * [nrktv:episode] Improve video id extraction (#25594, #26369, #26409) * [youtube] Fix age gate content detection (#26100, #26152, #26311, #26384) * [youtube:user] Extend URL regular expression (#26443) * [xhamster] Improve initials regular expression (#26526, #26353) * [svtplay] Fix video id extraction (#26425, #26428, #26438) * [twitch] Rework extractors (#12297, #20414, #20604, #21811, #21812, #22979, #24263, #25010, #25553, #25606) * Switch to GraphQL + Add support for collections + Add support for clips and collections playlists * [biqle] Improve video ext extraction * [xhamster] Fix extraction (#26157, #26254) * [xhamster] Extend URL regular expression (#25789, #25804, #25927)) version 2020.07.28 Extractors * [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137) * [youtube] Improve description extraction (#25937, #25980) * [wistia] Restrict embed regular expression (#25969) * [youtube] Prevent excess HTTP 301 (#25786) + [youtube:playlists] Extend URL regular expression (#25810) + [bellmedia] Add support for cp24.com clip URLs (#25764) * [brightcove] Improve embed detection (#25674) 
version 2020.06.16.1 Extractors * [youtube] Force old layout (#25682, #25683, #25680, #25686) * [youtube] Fix categories and improve tags extraction version 2020.06.16 Extractors * [youtube] Fix uploader id and uploader URL extraction * [youtube] Improve view count extraction * [youtube] Fix upload date extraction (#25677) * [youtube] Fix thumbnails extraction (#25676) * [youtube] Fix playlist and feed extraction (#25675) + [facebook] Add support for single-video ID links + [youtube] Extract chapters from JSON (#24819) + [kaltura] Add support for multiple embeds on a webpage (#25523) version 2020.06.06 Extractors * [tele5] Bypass geo restriction + [jwplatform] Add support for bypass geo restriction * [tele5] Prefer jwplatform over nexx (#25533) * [twitch:stream] Expect 400 and 410 HTTP errors from API * [twitch:stream] Fix extraction (#25528) * [twitch] Fix thumbnails extraction (#25531) + [twitch] Pass v5 Accept HTTP header (#25531) * [brightcove] Fix subtitles extraction (#25540) + [malltv] Add support for sk.mall.tv (#25445) * [periscope] Fix untitled broadcasts (#25482) * [jwplatform] Improve embeds extraction (#25467) version 2020.05.29 Core * [postprocessor/ffmpeg] Embed series metadata with --add-metadata * [utils] Fix file permissions in write_json_file (#12471, #25122) Extractors * [ard:beta] Extend URL regular expression (#25405) + [youtube] Add support for more invidious instances (#25417) * [giantbomb] Extend URL regular expression (#25222) * [ard] Improve URL regular expression (#25134, #25198) * [redtube] Improve formats extraction and extract m3u8 formats (#25311, #25321) * [indavideo] Switch to HTTPS for API request (#25191) * [redtube] Improve title extraction (#25208) * [vimeo] Improve format extraction and sorting (#25285) * [soundcloud] Reduce API playlist page limit (#25274) + [youtube] Add support for yewtu.be (#25226) * [mailru] Fix extraction (#24530, #25239) * [bellator] Fix mgid extraction (#25195) version 2020.05.08 Core * 
[downloader/http] Request last data block of exact remaining size * [downloader/http] Finish downloading once received data length matches expected * [extractor/common] Use compat_cookiejar_Cookie for _set_cookie to always ensure cookie name and value are bytestrings on python 2 (#23256, #24776) + [compat] Introduce compat_cookiejar_Cookie * [utils] Improve cookie files support + Add support for UTF-8 in cookie files * Skip malformed cookie file entries instead of crashing (invalid entry length, invalid expires at) Extractors * [youtube] Improve signature cipher extraction (#25187, #25188) * [iprima] Improve extraction (#25138) * [uol] Fix extraction (#22007) + [orf] Add support for more radio stations (#24938, #24968) * [dailymotion] Fix typo - [puhutv] Remove no longer available HTTP formats (#25124) version 2020.05.03 Core + [extractor/common] Extract multiple JSON-LD entries * [options] Clarify doc on --exec command (#19087, #24883) * [extractor/common] Skip malformed ISM manifest XMLs while extracting ISM formats (#24667) Extractors * [crunchyroll] Fix and improve extraction (#25096, #25060) * [youtube] Improve player id extraction * [youtube] Use redirected video id if any (#25063) * [yahoo] Fix GYAO Player extraction and relax URL regular expression (#24178, #24778) * [tvplay] Fix Viafree extraction (#15189, #24473, #24789) * [tenplay] Relax URL regular expression (#25001) + [prosiebensat1] Extract series metadata * [prosiebensat1] Improve extraction and remove 7tv.de support (#24948) - [prosiebensat1] Remove 7tv.de support (#24948) * [youtube] Fix DRM videos detection (#24736) * [thisoldhouse] Fix video id extraction (#24548, #24549) + [soundcloud] Extract AAC format (#19173, #24708) * [youtube] Skip broken multifeed videos (#24711) * [nova:embed] Fix extraction (#24700) * [motherless] Fix extraction (#24699) * [twitch:clips] Extend URL regular expression (#24290, #24642) * [tv4] Fix ISM formats extraction (#24667) * [tele5] Fix extraction (#24553) + 
[mofosex] Add support for generic embeds (#24633) + [youporn] Add support for generic embeds + [spankwire] Add support for generic embeds (#24633) * [spankwire] Fix extraction (#18924, #20648) version 2020.03.24 Core - [utils] Revert support for cookie files with spaces used instead of tabs Extractors * [teachable] Update upskillcourses and gns3 domains * [generic] Look for teachable embeds before wistia + [teachable] Extract chapter metadata (#24421) + [bilibili] Add support for player.bilibili.com (#24402) + [bilibili] Add support for new URL schema with BV ids (#24439, #24442) * [limelight] Remove disabled API requests (#24255) * [soundcloud] Fix download URL extraction (#24394) + [cbc:watch] Add support for authentication (#19160) * [hellporno] Fix extraction (#24399) * [xtube] Fix formats extraction (#24348) * [ndr] Fix extraction (#24326) * [nhk] Update m3u8 URL and use native HLS downloader (#24329) - [nhk] Remove obsolete rtmp formats (#24329) * [nhk] Relax URL regular expression (#24329) - [vimeo] Revert fix showcase password protected video extraction (#24224) version 2020.03.08 Core + [utils] Add support for cookie files with spaces used instead of tabs Extractors + [pornhub] Add support for pornhubpremium.com (#24288) - [youtube] Remove outdated code and unnecessary requests * [youtube] Improve extraction in 429 HTTP error conditions (#24283) * [nhk] Update API version (#24270) version 2020.03.06 Extractors * [youtube] Fix age-gated videos support without login (#24248) * [vimeo] Fix showcase password protected video extraction (#24224) * [pornhub] Improve title extraction (#24184) * [peertube] Improve extraction (#23657) + [servus] Add support for new URL schema (#23475, #23583, #24142) * [vimeo] Fix subtitles URLs (#24209) version 2020.03.01 Core * [YoutubeDL] Force redirect URL to unicode on python 2 - [options] Remove duplicate short option -v for --version (#24162) Extractors * [xhamster] Fix extraction (#24205) * [franceculture] Fix extraction 
(#24204) + [telecinco] Add support for article opening videos * [telecinco] Fix extraction (#24195) * [xtube] Fix metadata extraction (#21073, #22455) * [youjizz] Fix extraction (#24181) - Remove no longer needed compat_str around geturl * [pornhd] Fix extraction (#24128) + [teachable] Add support for multiple videos per lecture (#24101) + [wistia] Add support for multiple generic embeds (#8347, 11385) * [imdb] Fix extraction (#23443) * [tv2dk:bornholm:play] Fix extraction (#24076) version 2020.02.16 Core * [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591, #10622) * [update] Fix updating via symlinks (#23991) + [compat] Introduce compat_realpath (#23991) Extractors + [npr] Add support for streams (#24042) + [24video] Add support for porn.24video.net (#23779, #23784) - [jpopsuki] Remove extractor (#23858) * [nova] Improve extraction (#23690) * [nova:embed] Improve (#23690) * [nova:embed] Fix extraction (#23672) + [abc:iview] Add support for 720p (#22907, #22921) * [nytimes] Improve format sorting (#24010) + [toggle] Add support for mewatch.sg (#23895, #23930) * [thisoldhouse] Fix extraction (#23951) + [popcorntimes] Add support for popcorntimes.tv (#23949) * [sportdeutschland] Update to new API * [twitch:stream] Lowercase channel id for stream request (#23917) * [tv5mondeplus] Fix extraction (#23907, #23911) * [tva] Relax URL regular expression (#23903) * [vimeo] Fix album extraction (#23864) * [viewlift] Improve extraction * Fix extraction (#23851) + Add support for authentication + Add support for more domains * [svt] Fix series extraction (#22297) * [svt] Fix article extraction (#22897, #22919) * [soundcloud] Improve private playlist/set tracks extraction (#3707) version 2020.01.24 Extractors * [youtube] Fix sigfunc name extraction (#23819) * [stretchinternet] Fix extraction (#4319) * [voicerepublic] Fix extraction * [azmedien] Fix extraction (#23783) * [businessinsider] Fix jwplatform id extraction (#22929, #22954) + [24video] Add support 
for 24video.vip (#23753) * [ivi:compilation] Fix entries extraction (#23770) * [ard] Improve extraction (#23761) * Simplify extraction + Extract age limit and series * Bypass geo-restriction + [nbc] Add support for nbc multi network URLs (#23049) * [americastestkitchen] Fix extraction * [zype] Improve extraction + Extract subtitles (#21258) + Support URLs with alternative keys/tokens (#21258) + Extract more metadata * [orf:tvthek] Improve geo restricted videos detection (#23741) * [soundcloud] Restore previews extraction (#23739) version 2020.01.15 Extractors * [yourporn] Fix extraction (#21645, #22255, #23459) + [canvas] Add support for new API endpoint (#17680, #18629) * [ndr:base:embed] Improve thumbnails extraction (#23731) + [vodplatform] Add support for embed.kwikmotion.com domain + [twitter] Add support for promo_video_website cards (#23711) * [orf:radio] Clean description and improve extraction * [orf:fm4] Fix extraction (#23599) * [safari] Fix kaltura session extraction (#23679, #23670) * [lego] Fix extraction and extract subtitle (#23687) * [cloudflarestream] Improve extraction + Add support for bytehighway.net domain + Add support for signed URLs + Extract thumbnail * [naver] Improve extraction * Improve geo-restriction handling + Extract automatic captions + Extract uploader metadata + Extract VLive HLS formats * Improve metadata extraction - [pandatv] Remove extractor (#23630) * [dctp] Fix format extraction (#23656) + [scrippsnetworks] Add support for www.discovery.com videos * [discovery] Fix anonymous token extraction (#23650) * [nrktv:seriebase] Fix extraction (#23625, #23537) * [wistia] Improve format extraction and extract subtitles (#22590) * [vice] Improve extraction (#23631) * [redtube] Detect private videos (#23518) version 2020.01.01 Extractors * [brightcove] Invalidate policy key cache on failing requests * [pornhub] Improve locked videos detection (#22449, #22780) + [pornhub] Add support for m3u8 formats * [pornhub] Fix extraction (#22749, 
#23082) * [brightcove] Update policy key on failing requests * [spankbang] Improve removed video detection (#23423) * [spankbang] Fix extraction (#23307, #23423, #23444) * [soundcloud] Automatically update client id on failing requests * [prosiebensat1] Improve geo restriction handling (#23571) * [brightcove] Cache brightcove player policy keys * [teachable] Fail with error message if no video URL found * [teachable] Improve locked lessons detection (#23528) + [scrippsnetworks] Add support for Scripps Networks sites (#19857, #22981) * [mitele] Fix extraction (#21354, #23456) * [soundcloud] Update client id (#23516) * [mailru] Relax URL regular expressions (#23509) version 2019.12.25 Core * [utils] Improve str_to_int + [downloader/hls] Add ability to override AES decryption key URL (#17521) Extractors * [mediaset] Fix parse formats (#23508) + [tv2dk:bornholm:play] Add support for play.tv2bornholm.dk (#23291) + [slideslive] Add support for url and vimeo service names (#23414) * [slideslive] Fix extraction (#23413) * [twitch:clips] Fix extraction (#23375) + [soundcloud] Add support for token protected embeds (#18954) * [vk] Improve extraction * Fix User Videos extraction (#23356) * Extract all videos for lists with more than 1000 videos (#23356) + Add support for video albums (#14327, #14492) - [kontrtube] Remove extractor - [videopremium] Remove extractor - [musicplayon] Remove extractor (#9225) + [ufctv] Add support for ufcfightpass.imgdge.com and ufcfightpass.imggaming.com (#23343) + [twitch] Extract m3u8 formats frame rate (#23333) + [imggaming] Add support for playlists and extract subtitles + [ufcarabia] Add support for UFC Arabia (#23312) * [ufctv] Fix extraction * [yahoo] Fix gyao brightcove player id (#23303) * [vzaar] Override AES decryption key URL (#17521) + [vzaar] Add support for AES HLS manifests (#17521, #23299) * [nrl] Fix extraction * [teachingchannel] Fix extraction * [nintendo] Fix extraction and partially add support for Nintendo Direct videos 
(#4592) + [ooyala] Add better fallback values for domain and streams variables + [youtube] Add support youtubekids.com (#23272) * [tv2] Detect DRM protection + [tv2] Add support for katsomo.fi and mtv.fi (#10543) * [tv2] Fix tv2.no article extraction * [msn] Improve extraction + Add support for YouTube and NBCSports embeds + Add support for articles with multiple videos * Improve AOL embed support * Improve format extraction * [abcotvs] Relax URL regular expression and improve metadata extraction (#18014) * [channel9] Reduce response size * [adobetv] Improve extraction * Use OnDemandPagedList for list extractors * Reduce show extraction requests * Extract original video format and subtitles + Add support for adobe tv embeds version 2019.11.28 Core + [utils] Add generic caesar cipher and rot47 * [utils] Handle rd-suffixed day parts in unified_strdate (#23199) Extractors * [vimeo] Improve extraction * Fix review extraction * Fix ondemand extraction * Make password protected player case as an expected error (#22896) * Simplify channel based extractors code - [openload] Remove extractor (#11999) - [verystream] Remove extractor - [streamango] Remove extractor (#15406) * [dailymotion] Improve extraction * Extract http formats included in m3u8 manifest * Fix user extraction (#3553, #21415) + Add support for User Authentication (#11491) * Fix password protected videos extraction (#23176) * Respect age limit option and family filter cookie value (#18437) * Handle video url playlist query param * Report allowed countries for geo-restricted videos * [corus] Improve extraction + Add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com and disneylachaine.ca (#20861) + Add support for self hosted videos (#22075) * Detect DRM protection (#14910, #9164) * [vivo] Fix extraction (#22328, #22279) + [bitchute] Extract upload date (#22990, #23193) * [soundcloud] Update client id (#23214) version 2019.11.22 Core + [extractor/common] Clean jwplayer description HTML tags 
+ [extractor/common] Add data, headers and query to all major extract formats methods Extractors * [chaturbate] Fix extraction (#23010, #23012) + [ntvru] Add support for non relative file URLs (#23140) * [vk] Fix wall audio thumbnails extraction (#23135) * [ivi] Fix format extraction (#21991) - [comcarcoff] Remove extractor + [drtv] Add support for new URL schema (#23059) + [nexx] Add support for Multi Player JS Setup (#23052) + [teamcoco] Add support for new videos (#23054) * [soundcloud] Check if the soundtrack has downloads left (#23045) * [facebook] Fix posts video data extraction (#22473) - [addanime] Remove extractor - [minhateca] Remove extractor - [daisuki] Remove extractor * [seeker] Fix extraction - [revision3] Remove extractors * [twitch] Fix video comments URL (#18593, #15828) * [twitter] Improve extraction + Add support for generic embeds (#22168) * Always extract http formats for native videos (#14934) + Add support for Twitter Broadcasts (#21369) + Extract more metadata * Improve VMap format extraction * Unify extraction code for both twitter statuses and cards + [twitch] Add support for Clip embed URLs * [lnkgo] Fix extraction (#16834) * [mixcloud] Improve extraction * Improve metadata extraction (#11721) * Fix playlist extraction (#22378) * Fix user mixes extraction (#15197, #17865) + [kinja] Add support for Kinja embeds (#5756, #11282, #22237, #22384) * [onionstudios] Fix extraction + [hotstar] Pass Referer header to format requests (#22836) * [dplay] Minimize response size + [patreon] Extract uploader_id and filesize * [patreon] Minimize response size * [roosterteeth] Fix login request (#16094, #22689) version 2019.11.05 Extractors + [scte] Add support for learning.scte.org (#22975) + [msn] Add support for Vidible and AOL embeds (#22195, #22227) * [myspass] Fix video URL extraction and improve metadata extraction (#22448) * [jamendo] Improve extraction * Fix album extraction (#18564) * Improve metadata extraction (#18565, #21379) * [mediaset] 
Relax URL guid matching (#18352) + [mediaset] Extract unprotected M3U and MPD manifests (#17204) * [telegraaf] Fix extraction + [bellmedia] Add support for marilyn.ca videos (#22193) * [stv] Fix extraction (#22928) - [iconosquare] Remove extractor - [keek] Remove extractor - [gameone] Remove extractor (#21778) - [flipagram] Remove extractor - [bambuser] Remove extractor * [wistia] Reduce embed extraction false positives + [wistia] Add support for inline embeds (#22931) - [go90] Remove extractor * [kakao] Remove raw request + [kakao] Extract format total bitrate * [daum] Fix VOD and Clip extraction (#15015) * [kakao] Improve extraction + Add support for embed URLs + Add support for Kakao Legacy vid based embed URLs * Only extract fields used for extraction * Strip description and extract tags * [mixcloud] Fix cloudcast data extraction (#22821) * [yahoo] Improve extraction + Add support for live streams (#3597, #3779, #22178) * Bypass cookie consent page for european domains (#16948, #22576) + Add generic support for embeds (#20332) * [tv2] Fix and improve extraction (#22787) + [tv2dk] Add support for TV2 DK sites * [onet] Improve extraction … + Add support for onet100.vod.pl + Extract m3u8 formats * Correct audio only format info * [fox9] Fix extraction version 2019.10.29 Core * [utils] Actualize major IPv4 address blocks per country Extractors + [go] Add support for abc.com and freeform.com (#22823, #22864) + [mtv] Add support for mtvjapan.com * [mtv] Fix extraction for mtv.de (#22113) * [videodetective] Fix extraction * [internetvideoarchive] Fix extraction * [nbcnews] Fix extraction (#12569, #12576, #21703, #21923) - [hark] Remove extractor - [tutv] Remove extractor - [learnr] Remove extractor - [macgamestore] Remove extractor * [la7] Update Kaltura service URL (#22358) * [thesun] Fix extraction (#16966) - [makertv] Remove extractor + [tenplay] Add support for 10play.com.au (#21446) * [soundcloud] Improve extraction * Improve format extraction (#22123) + Extract 
uploader_id and uploader_url (#21916) + Extract all known thumbnails (#19071, #20659) * Fix extraction for private playlists (#20976) + Add support for playlist embeds (#20976) * Skip preview formats (#22806) * [dplay] Improve extraction + Add support for dplay.fi, dplay.jp and es.dplay.com (#16969) * Fix it.dplay.com extraction (#22826) + Extract creator, tags and thumbnails * Handle playback API call errors + [discoverynetworks] Add support for dplay.co.uk * [vk] Improve extraction + Add support for Odnoklassniki embeds + Extract more videos from user lists (#4470) + Fix wall post audio extraction (#18332) * Improve error detection (#22568) + [odnoklassniki] Add support for embeds * [puhutv] Improve extraction * Fix subtitles extraction * Transform HLS URLs to HTTP URLs * Improve metadata extraction * [ceskatelevize] Skip DRM media + [facebook] Extract subtitles (#22777) * [globo] Handle alternative hash signing method version 2019.10.22 Core * [utils] Improve subtitles_filename (#22753) Extractors * [facebook] Bypass download rate limits (#21018) + [contv] Add support for contv.com - [viewster] Remove extractor * [xfileshare] Improve extractor (#17032, #17906, #18237, #18239) * Update the list of domains + Add support for aa-encoded video data * Improve jwplayer format extraction + Add support for Clappr sources * [mangomolo] Fix video format extraction and add support for player URLs * [audioboom] Improve metadata extraction * [twitch] Update VOD URL matching (#22395, #22727) - [mit] Remove support for video.mit.edu (#22403) - [servingsys] Remove extractor (#22639) * [dumpert] Fix extraction (#22428, #22564) * [atresplayer] Fix extraction (#16277, #16716) version 2019.10.16 Core * [extractor/common] Make _is_valid_url more relaxed Extractors * [vimeo] Improve album videos id extraction (#22599) + [globo] Extract subtitles (#22713) * [bokecc] Improve player params extraction (#22638) * [nexx] Handle result list (#22666) * [vimeo] Fix VHX embed extraction * [nbc] 
Switch to graphql API (#18581, #22693, #22701) - [vessel] Remove extractor - [promptfile] Remove extractor (#6239) * [kaltura] Fix service URL extraction (#22658) * [kaltura] Fix embed info strip (#22658) * [globo] Fix format extraction (#20319) * [redtube] Improve metadata extraction (#22492, #22615) * [pornhub:uservideos:upload] Fix extraction (#22619) + [telequebec:squat] Add support for squat.telequebec.tv (#18503) - [wimp] Remove extractor (#22088, #22091) + [gfycat] Extend URL regular expression (#22225) + [chaturbate] Extend URL regular expression (#22309) * [peertube] Update instances (#22414) + [telequebec] Add support for coucou.telequebec.tv (#22482) + [xvideos] Extend URL regular expression (#22471) - [youtube] Remove support for invidious.enkirton.net (#22543) + [openload] Add support for oload.monster (#22592) * [nrktv:seriebase] Fix extraction (#22596) + [youtube] Add support for yt.lelux.fi (#22597) * [orf:tvthek] Make manifest requests non fatal (#22578) * [teachable] Skip login when already logged in (#22572) * [viewlift] Improve extraction (#22545) * [nonktube] Fix extraction (#22544) version 2019.09.28 Core * [YoutubeDL] Honour all --get-* options with --flat-playlist (#22493) Extractors * [vk] Fix extraction (#22522) * [heise] Fix kaltura embeds extraction (#22514) * [ted] Check for resources validity and extract subtitled downloads (#22513) + [youtube] Add support for owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya.b32.i2p (#22292) + [nhk] Add support for clips * [nhk] Fix video extraction (#22249, #22353) * [byutv] Fix extraction (#22070) + [openload] Add support for oload.online (#22304) + [youtube] Add support for invidious.drycat.fr (#22451) * [jwplatfom] Do not match video URLs (#20596, #22148) * [youtube:playlist] Unescape playlist uploader (#22483) + [bilibili] Add support audio albums and songs (#21094) + [instagram] Add support for tv URLs + [mixcloud] Allow uppercase letters in format URLs (#19280) * [brightcove] Delegate all 
supported legacy URLs to new extractor (#11523, #12842, #13912, #15669, #16303) * [hotstar] Use native HLS downloader by default + [hotstar] Extract more formats (#22323) * [9now] Fix extraction (#22361) * [zdf] Bypass geo restriction + [tv4] Extract series metadata * [tv4] Fix extraction (#22443) version 2019.09.12.1 Extractors * [youtube] Remove quality and tbr for itag 43 (#22372) version 2019.09.12 Extractors * [youtube] Quick extraction tempfix (#22367, #22163) version 2019.09.01 Core + [extractor/generic] Add support for squarespace embeds (#21294, #21802, #21859) + [downloader/external] Respect mtime option for aria2c (#22242) Extractors + [xhamster:user] Add support for user pages (#16330, #18454) + [xhamster] Add support for more domains + [verystream] Add support for woof.tube (#22217) + [dailymotion] Add support for lequipe.fr (#21328, #22152) + [openload] Add support for oload.vip (#22205) + [bbccouk] Extend URL regular expression (#19200) + [youtube] Add support for invidious.nixnet.xyz and yt.elukerio.org (#22223) * [safari] Fix authentication (#22161, #22184) * [usanetwork] Fix extraction (#22105) + [einthusan] Add support for einthusan.ca (#22171) * [youtube] Improve unavailable message extraction (#22117) + [piksel] Extract subtitles (#20506) version 2019.08.13 Core * [downloader/fragment] Fix ETA calculation of resumed download (#21992) * [YoutubeDL] Check annotations availability (#18582) Extractors * [youtube:playlist] Improve flat extraction (#21927) * [youtube] Fix annotations extraction (#22045) + [discovery] Extract series meta field (#21808) * [youtube] Improve error detection (#16445) * [vimeo] Fix album extraction (#1933, #15704, #15855, #18967, #21986) + [roosterteeth] Add support for watch URLs * [discovery] Limit video data by show slug (#21980) version 2019.08.02 Extractors + [tvigle] Add support for HLS and DASH formats (#21967) * [tvigle] Fix extraction (#21967) + [yandexvideo] Add support for DASH formats (#21971) * [discovery] Use 
API call for video data extraction (#21808) + [mgtv] Extract format_note (#21881) * [tvn24] Fix metadata extraction (#21833, #21834) * [dlive] Relax URL regular expression (#21909) + [openload] Add support for oload.best (#21913) * [youtube] Improve metadata extraction for age gate content (#21943) version 2019.07.30 Extractors * [youtube] Fix and improve title and description extraction (#21934) version 2019.07.27 Extractors + [yahoo:japannews] Add support for yahoo.co.jp (#21698, #21265) + [discovery] Add support go.discovery.com URLs * [youtube:playlist] Relax video regular expression (#21844) * [generic] Restrict --default-search schemeless URLs detection pattern (#21842) * [vrv] Fix CMS signing query extraction (#21809) version 2019.07.16 Extractors + [asiancrush] Add support for yuyutv.com, midnightpulp.com and cocoro.tv (#21281, #21290) * [kaltura] Check source format URL (#21290) * [ctsnews] Fix YouTube embeds extraction (#21678) + [einthusan] Add support for einthusan.com (#21748, #21775) + [youtube] Add support for invidious.mastodon.host (#21777) + [gfycat] Extend URL regular expression (#21779, #21780) * [youtube] Restrict is_live extraction (#21782) version 2019.07.14 Extractors * [porn91] Fix extraction (#21312) + [yandexmusic] Extract track number and disk number (#21421) + [yandexmusic] Add support for multi disk albums (#21420, #21421) * [lynda] Handle missing subtitles (#20490, #20513) + [youtube] Add more invidious instances to URL regular expression (#21694) * [twitter] Improve uploader id extraction (#21705) * [spankbang] Fix and improve metadata extraction * [spankbang] Fix extraction (#21763, #21764) + [dlive] Add support for dlive.tv (#18080) + [livejournal] Add support for livejournal.com (#21526) * [roosterteeth] Fix free episode extraction (#16094) * [dbtv] Fix extraction * [bellator] Fix extraction - [rudo] Remove extractor (#18430, #18474) * [facebook] Fallback to twitter:image meta for thumbnail extraction (#21224) * [bleacherreport] 
Fix Bleacher Report CMS extraction * [espn] Fix fivethirtyeight.com extraction * [5tv] Relax video URL regular expression and support https URLs * [youtube] Fix is_live extraction (#21734) * [youtube] Fix authentication (#11270) version 2019.07.12 Core + [adobepass] Add support for AT&T U-verse (mso ATT) (#13938, #21016) Extractors + [mgtv] Pass Referer HTTP header for format URLs (#21726) + [beeg] Add support for api/v6 v2 URLs without t argument (#21701) * [voxmedia:volume] Improvevox embed extraction (#16846) * [funnyordie] Move extraction to VoxMedia extractor (#16846) * [gameinformer] Fix extraction (#8895, #15363, #17206) * [funk] Fix extraction (#17915) * [packtpub] Relax lesson URL regular expression (#21695) * [packtpub] Fix extraction (#21268) * [philharmoniedeparis] Relax URL regular expression (#21672) * [peertube] Detect embed URLs in generic extraction (#21666) * [mixer:vod] Relax URL regular expression (#21657, #21658) + [lecturio] Add support id based URLs (#21630) + [go] Add site info for disneynow (#21613) * [ted] Restrict info regular expression (#21631) * [twitch:vod] Actualize m3u8 URL (#21538, #21607) * [vzaar] Fix videos with empty title (#21606) * [tvland] Fix extraction (#21384) * [arte] Clean extractor (#15583, #21614) version 2019.07.02 Core + [utils] Introduce random_user_agent and use as default User-Agent (#21546) Extractors + [vevo] Add support for embed.vevo.com URLs (#21565) + [openload] Add support for oload.biz (#21574) * [xiami] Update API base URL (#21575) * [yourporn] Fix extraction (#21585) + [acast] Add support for URLs with episode id (#21444) + [dailymotion] Add support for DM.player embeds * [soundcloud] Update client id version 2019.06.27 Extractors + [go] Add support for disneynow.com (#21528) * [mixer:vod] Relax URL regular expression (#21531, #21536) * [drtv] Relax URL regular expression * [fusion] Fix extraction (#17775, #21269) - [nfb] Remove extractor (#21518) + [beeg] Add support for api/v6 v2 URLs (#21511) + 
[brightcove:new] Add support for playlists (#21331) + [openload] Add support for oload.life (#21495) * [vimeo:channel,group] Make title extraction non fatal * [vimeo:likes] Implement extrator in terms of channel extractor (#21493) + [pornhub] Add support for more paged video sources + [pornhub] Add support for downloading single pages and search pages (#15570) * [pornhub] Rework extractors (#11922, #16078, #17454, #17936) + [youtube] Add another signature function pattern * [tf1] Fix extraction (#21365, #21372) * [crunchyroll] Move Accept-Language workaround to video extractor since it causes playlists not to list any videos * [crunchyroll:playlist] Fix and relax title extraction (#21291, #21443) version 2019.06.21 Core * [utils] Restrict parse_codecs and add theora as known vcodec (#21381) Extractors * [youtube] Update signature function patterns (#21469, #21476) * [youtube] Make --write-annotations non fatal (#21452) + [sixplay] Add support for rtlmost.hu (#21405) * [youtube] Hardcode codec metadata for av01 video only formats (#21381) * [toutv] Update client key (#21370) + [biqle] Add support for new embed domain * [cbs] Improve DRM protected videos detection (#21339) version 2019.06.08 Core * [downloader/common] Improve rate limit (#21301) * [utils] Improve strip_or_none * [extractor/common] Strip src attribute for HTML5 entries code (#18485, #21169) Extractors * [ted] Fix playlist extraction (#20844, #21032) * [vlive:playlist] Fix video extraction when no playlist is found (#20590) + [vlive] Add CH+ support (#16887, #21209) + [openload] Add support for oload.website (#21329) + [tvnow] Extract HD formats (#21201) + [redbulltv] Add support for rrn:content URLs (#21297) * [youtube] Fix average rating extraction (#21304) + [bitchute] Extract HTML5 formats (#21306) * [cbsnews] Fix extraction (#9659, #15397) * [vvvvid] Relax URL regular expression (#21299) + [prosiebensat1] Add support for new API (#21272) + [vrv] Extract adaptive_hls formats (#21243) * [viki] 
Switch to HTTPS (#21001) * [LiveLeak] Check if the original videos exist (#21206, #21208) * [rtp] Fix extraction (#15099) * [youtube] Improve DRM protected videos detection (#1774) + [srgssrplay] Add support for popupvideoplayer URLs (#21155) + [24video] Add support for porno.24video.net (#21194) + [24video] Add support for 24video.site (#21193) - [pornflip] Remove extractor - [criterion] Remove extractor (#21195) * [pornhub] Use HTTPS (#21061) * [bitchute] Fix uploader extraction (#21076) * [streamcloud] Reduce waiting time to 6 seconds (#21092) - [novamov] Remove extractors (#21077) + [openload] Add support for oload.press (#21135) * [vivo] Fix extraction (#18906, #19217) version 2019.05.20 Core + [extractor/common] Move workaround for applying first Set-Cookie header into a separate _apply_first_set_cookie_header method Extractors * [safari] Fix authentication (#21090) * [vk] Use _apply_first_set_cookie_header * [vrt] Fix extraction (#20527) + [canvas] Add support for vrtnieuws and sporza site ids and extract AES HLS formats + [vrv] Extract captions (#19238) * [tele5] Improve video id extraction * [tele5] Relax URL regular expression (#21020, #21063) * [svtplay] Update API URL (#21075) + [yahoo:gyao] Add X-User-Agent header to dam proxy requests (#21071) version 2019.05.11 Core * [utils] Transliterate "þ" as "th" (#20897) Extractors + [cloudflarestream] Add support for videodelivery.net (#21049) + [byutv] Add support for DVR videos (#20574, #20676) + [gfycat] Add support for URLs with tags (#20696, #20731) + [openload] Add support for verystream.com (#20701, #20967) * [youtube] Use sp field value for signature field name (#18841, #18927, #21028) + [yahoo:gyao] Extend URL regular expression (#21008) * [youtube] Fix channel id extraction (#20982, #21003) + [sky] Add support for news.sky.com (#13055) + [youtube:entrylistbase] Retry on 5xx HTTP errors (#20965) + [francetvinfo] Extend video id extraction (#20619, #20740) * [4tube] Update token hosts (#20918) * 
[hotstar] Move to API v2 (#20931) * [fox] Fix API error handling under python 2 (#20925) + [redbulltv] Extend URL regular expression (#20922) version 2019.04.30 Extractors * [openload] Use real Chrome versions (#20902) - [youtube] Remove info el for get_video_info request * [youtube] Improve extraction robustness - [dramafever] Remove extractor (#20868) * [adn] Fix subtitle extraction (#12724) + [ccc] Extract creator (#20355) + [ccc:playlist] Add support for media.ccc.de playlists (#14601, #20355) + [sverigesradio] Add support for sverigesradio.se (#18635) + [cinemax] Add support for cinemax.com * [sixplay] Try extracting non-DRM protected manifests (#20849) + [youtube] Extract Youtube Music Auto-generated metadata (#20599, #20742) - [wrzuta] Remove extractor (#20684, #20801) * [twitch] Prefer source format (#20850) + [twitcasting] Add support for private videos (#20843) * [reddit] Validate thumbnail URL (#20030) * [yandexmusic] Fix track URL extraction (#20820) version 2019.04.24 Extractors * [youtube] Fix extraction (#20758, #20759, #20761, #20762, #20764, #20766, #20767, #20769, #20771, #20768, #20770) * [toutv] Fix extraction and extract series info (#20757) + [vrv] Add support for movie listings (#19229) + [youtube] Print error when no data is available (#20737) + [soundcloud] Add support for new rendition and improve extraction (#20699) + [ooyala] Add support for geo verification proxy + [nrl] Add support for nrl.com (#15991) + [vimeo] Extract live archive source format (#19144) + [vimeo] Add support for live streams and improve info extraction (#19144) + [ntvcojp] Add support for cu.ntv.co.jp + [nhk] Extract RTMPT format + [nhk] Add support for audio URLs + [udemy] Add another course id extraction pattern (#20491) + [openload] Add support for oload.services (#20691) + [openload] Add support for openloed.co (#20691, #20693) * [bravotv] Fix extraction (#19213) version 2019.04.17 Extractors * [openload] Randomize User-Agent (#20688) + [openload] Add support for 
oladblock domains (#20471) * [adn] Fix subtitle extraction (#12724) + [aol] Add support for localized websites + [yahoo] Add support GYAO episode URLs + [yahoo] Add support for streaming.yahoo.co.jp (#5811, #7098) + [yahoo] Add support for gyao.yahoo.co.jp * [aenetworks] Fix history topic extraction and extract more formats + [cbs] Extract smpte and vtt subtitles + [streamango] Add support for streamcherry.com (#20592) + [yourporn] Add support for sxyprn.com (#20646) * [mgtv] Fix extraction (#20650) * [linkedin:learning] Use urljoin for form action URL (#20431) + [gdc] Add support for kaltura embeds (#20575) * [dispeak] Improve mp4 bitrate extraction * [kaltura] Sanitize embed URLs * [jwplatfom] Do not match manifest URLs (#20596) * [aol] Restrict URL regular expression and improve format extraction + [tiktok] Add support for new URL schema (#20573) + [stv:player] Add support for player.stv.tv (#20586) version 2019.04.07 Core + [downloader/external] Pass rtmp_conn to ffmpeg Extractors + [ruutu] Add support for audio podcasts (#20473, #20545) + [xvideos] Extract all thumbnails (#20432) + [platzi] Add support for platzi.com (#20562) * [dvtv] Fix extraction (#18514, #19174) + [vrv] Add basic support for individual movie links (#19229) + [bfi:player] Add support for player.bfi.org.uk (#19235) * [hbo] Fix extraction and extract subtitles (#14629, #13709) * [youtube] Extract srv[1-3] subtitle formats (#20566) * [adultswim] Fix extraction (#18025) * [teamcoco] Fix extraction and add support for subdomains (#17099, #20339) * [adn] Fix subtitle compatibility with ffmpeg * [adn] Fix extraction and add support for positioning styles (#20549) * [vk] Use unique video id (#17848) * [newstube] Fix extraction * [rtl2] Actualize extraction + [adobeconnect] Add support for adobeconnect.com (#20283) + [gaia] Add support for authentication (#14605) + [mediasite] Add support for dashed ids and named catalogs (#20531) version 2019.04.01 Core * [utils] Improve int_or_none and 
float_or_none (#20403) * Check for valid --min-sleep-interval when --max-sleep-interval is specified (#20435) Extractors + [weibo] Extend URL regular expression (#20496) + [xhamster] Add support for xhamster.one (#20508) + [mediasite] Add support for catalogs (#20507) + [teamtreehouse] Add support for teamtreehouse.com (#9836) + [ina] Add support for audio URLs * [ina] Improve extraction * [cwtv] Fix episode number extraction (#20461) * [npo] Improve DRM detection + [pornhub] Add support for DASH formats (#20403) * [svtplay] Update API endpoint (#20430) version 2019.03.18 Core * [extractor/common] Improve HTML5 entries extraction + [utils] Introduce parse_bitrate * [update] Hide update URLs behind redirect * [extractor/common] Fix url meta field for unfragmented DASH formats (#20346) Extractors + [yandexvideo] Add extractor * [openload] Improve embed detection + [corus] Add support for bigbrothercanada.ca (#20357) + [orf:radio] Extract series (#20012) + [cbc:watch] Add support for gem.cbc.ca (#20251, #20359) - [anysex] Remove extractor (#19279) + [ciscolive] Add support for new URL schema (#20320, #20351) + [youtube] Add support for invidiou.sh (#20309) - [anitube] Remove extractor (#20334) - [ruleporn] Remove extractor (#15344, #20324) * [npr] Fix extraction (#10793, #13440) * [biqle] Fix extraction (#11471, #15313) * [viddler] Modernize * [moevideo] Fix extraction * [primesharetv] Remove extractor * [hypem] Modernize and extract more metadata (#15320) * [veoh] Fix extraction * [escapist] Modernize - [videomega] Remove extractor (#10108) + [beeg] Add support for beeg.porn (#20306) * [vimeo:review] Improve config url extraction and extract original format (#20305) * [fox] Detect geo restriction and authentication errors (#20208) version 2019.03.09 Core * [extractor/common] Use compat_etree_Element + [compat] Introduce compat_etree_Element * [extractor/common] Fallback url to base URL for DASH formats * [extractor/common] Do not fail on invalid data while parsing 
F4M manifest in non fatal mode * [extractor/common] Return MPD manifest as format's url meta field (#20242) * [utils] Strip #HttpOnly_ prefix from cookies files (#20219) Extractors * [francetv:site] Relax video id regular expression (#20268) * [toutv] Detect invalid login error * [toutv] Fix authentication (#20261) + [urplay] Extract timestamp (#20235) + [openload] Add support for oload.space (#20246) * [facebook] Improve uploader extraction (#20250) * [bbc] Use compat_etree_Element * [crunchyroll] Use compat_etree_Element * [npo] Improve ISM extraction * [rai] Improve extraction (#20253) * [paramountnetwork] Fix mgid extraction (#20241) * [libsyn] Improve extraction (#20229) + [youtube] Add more invidious instances to URL regular expression (#20228) * [spankbang] Fix extraction (#20023) * [espn] Extend URL regular expression (#20013) * [sixplay] Handle videos with empty assets (#20016) + [vimeo] Add support for Vimeo Pro portfolio protected videos (#20070) version 2019.03.01 Core + [downloader/external] Add support for rate limit and retries for wget * [downloader/external] Fix infinite retries for curl (#19303) Extractors * [npo] Fix extraction (#20084) * [francetv:site] Extend video id regex (#20029, #20071) + [periscope] Extract width and height (#20015) * [servus] Fix extraction (#19297) * [bbccouk] Make subtitles non fatal (#19651) * [metacafe] Fix family filter bypass (#19287) version 2019.02.18 Extractors * [tvp:website] Fix and improve extraction + [tvp] Detect unavailable videos * [tvp] Fix description extraction and make thumbnail optional + [linuxacademy] Add support for linuxacademy.com (#12207) * [bilibili] Update keys (#19233) * [udemy] Extend URL regular expressions (#14330, #15883) * [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126) * [noovo] Fix extraction (#19230) * [rai] Relax URL regular expression (#19232) + [vshare] Pass Referer to download request (#19205, #19221) + [openload] Add support for oload.live (#19222) * [imgur] 
Use video id as title fallback (#18590) + [twitch] Add new source format detection approach (#19193) * [tvplayhome] Fix video id extraction (#19190) * [tvplayhome] Fix episode metadata extraction (#19190) * [rutube:embed] Fix extraction (#19163) + [rutube:embed] Add support private videos (#19163) + [soundcloud] Extract more metadata + [trunews] Add support for trunews.com (#19153) + [linkedin:learning] Extract chapter_number and chapter_id (#19162) version 2019.02.08 Core * [utils] Improve JSON-LD regular expression (#18058) * [YoutubeDL] Fallback to ie_key of matching extractor while making download archive id when no explicit ie_key is provided (#19022) Extractors + [malltv] Add support for mall.tv (#18058, #17856) + [spankbang:playlist] Add support for playlists (#19145) * [spankbang] Extend URL regular expression * [trutv] Fix extraction (#17336) * [toutv] Fix authentication (#16398, #18700) * [pornhub] Fix tags and categories extraction (#13720, #19135) * [pornhd] Fix formats extraction + [pornhd] Extract like count (#19123, #19125) * [radiocanada] Switch to the new media requests (#19115) + [teachable] Add support for courses.workitdaily.com (#18871) - [vporn] Remove extractor (#16276) + [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086) + [drtuber] Extract duration (#19078) * [soundcloud] Fix paged playlists extraction, add support for albums and update client id * [soundcloud] Update client id * [drtv] Improve preference (#19079) + [openload] Add support for openload.pw and oload.pw (#18930) + [openload] Add support for oload.info (#19073) * [crackle] Authorize media detail request (#16931) version 2019.01.30.1 Core * [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067) version 2019.01.30 Core * [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding subtitles (#19024, #19042) * [postprocessor/ffmpeg] Disable "Last message repeated" messages (#19025) Extractors * [yourporn] Fix extraction and 
extract duration (#18815, #18852, #19061) * [drtv] Improve extraction (#19039) + Add support for EncryptedUri videos + Extract more metadata * Fix subtitles extraction + [fox] Add support for locked videos using cookies (#19060) * [fox] Fix extraction for free videos (#19060) + [zattoo] Add support for tv.salt.ch (#19059) version 2019.01.27 Core + [extractor/common] Extract season in _json_ld * [postprocessor/ffmpeg] Fallback to ffmpeg/avconv for audio codec detection (#681) Extractors * [vice] Fix extraction for locked videos (#16248) + [wakanim] Detect DRM protected videos + [wakanim] Add support for wakanim.tv (#14374) * [usatoday] Fix extraction for videos with custom brightcove partner id (#18990) * [drtv] Fix extraction (#18989) * [nhk] Extend URL regular expression (#18968) * [go] Fix Adobe Pass requests for Disney Now (#18901) + [openload] Add support for oload.club (#18969) version 2019.01.24 Core * [YoutubeDL] Fix negation for string operators in format selection (#18961) version 2019.01.23 Core * [utils] Fix urljoin for paths with non-http(s) schemes * [extractor/common] Improve jwplayer relative URL handling (#18892) + [YoutubeDL] Add negation support for string comparisons in format selection expressions (#18600, #18805) * [extractor/common] Improve HLS video-only format detection (#18923) Extractors * [crunchyroll] Extend URL regular expression (#18955) * [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722, #17197, #18338 #18842, #18899) + [vrv] Add support for authentication (#14307) * [videomore:season] Fix extraction * [videomore] Improve extraction (#18908) + [tnaflix] Pass Referer in metadata request (#18925) * [radiocanada] Relax DRM check (#18608, #18609) * [vimeo] Fix video password verification for videos protected by Referer HTTP header + [hketv] Add support for hkedcity.net (#18696) + [streamango] Add support for fruithosts.net (#18710) + [instagram] Add support for tags (#18757) + [odnoklassniki] Detect paid videos 
(#18876) * [ted] Correct acodec for HTTP formats (#18923) * [cartoonnetwork] Fix extraction (#15664, #17224) * [vimeo] Fix extraction for password protected player URLs (#18889) version 2019.01.17 Extractors * [youtube] Extend JS player signature function name regular expressions (#18890, #18891, #18893) version 2019.01.16 Core + [test/helper] Add support for maxcount and count collection len checkers * [downloader/hls] Fix uplynk ad skipping (#18824) * [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813) Extractors * [youtube] Skip unsupported adaptive stream type (#18804) + [youtube] Extract DASH formats from player response (#18804) * [funimation] Fix extraction (#14089) * [skylinewebcams] Fix extraction (#18853) + [curiositystream] Add support for non app URLs + [bitchute] Check formats (#18833) * [wistia] Extend URL regular expression (#18823) + [playplustv] Add support for playplus.com (#18789) version 2019.01.10 Core * [extractor/common] Use episode name as title in _json_ld + [extractor/common] Add support for movies in _json_ld * [postprocessor/ffmpeg] Embed subtitles with non-standard language codes (#18765) + [utils] Add language codes replaced in 1989 revision of ISO 639 to ISO639Utils (#18765) Extractors * [youtube] Extract live HLS URL from player response (#18799) + [outsidetv] Add support for outsidetv.com (#18774) * [jwplatform] Use JW Platform Delivery API V2 and add support for more URLs + [fox] Add support National Geographic (#17985, #15333, #14698) + [playplustv] Add support for playplus.tv (#18789) * [globo] Set GLBID cookie manually (#17346) + [gaia] Add support for gaia.com (#14605) * [youporn] Fix title and description extraction (#18748) + [hungama] Add support for hungama.com (#17402, #18771) * [dtube] Fix extraction (#18741) * [tvnow] Fix and rework extractors and prepare for a switch to the new API (#17245, #18499) * [carambatv:page] Fix extraction (#18739) version 2019.01.02 Extractors * [discovery] Use geo verification 
headers (#17838) + [packtpub] Add support for subscription.packtpub.com (#18718) * [yourporn] Fix extraction (#18583) + [acast:channel] Add support for play.acast.com (#18587) + [extractors] Add missing age limits (#18621) + [rmcdecouverte] Add support for live stream * [rmcdecouverte] Bypass geo restriction * [rmcdecouverte] Update URL regular expression (#18595, 18697) * [manyvids] Fix extraction (#18604, #18614) * [bitchute] Fix extraction (#18567) version 2018.12.31 Extractors + [bbc] Add support for another embed pattern (#18643) + [npo:live] Add support for npostart.nl (#18644) * [beeg] Fix extraction (#18610, #18626) * [youtube] Unescape HTML for series (#18641) + [youtube] Extract more format metadata * [youtube] Detect DRM protected videos (#1774) * [youtube] Relax HTML5 player regular expressions (#18465, #18466) * [youtube] Extend HTML5 player regular expression (#17516) + [liveleak] Add support for another embed type and restore original format extraction + [crackle] Extract ISM and HTTP formats + [twitter] Pass Referer with card request (#18579) * [mediasite] Extend URL regular expression (#18558) + [lecturio] Add support for lecturio.de (#18562) + [discovery] Add support for Scripps Networks watch domains (#17947) version 2018.12.17 Extractors * [ard:beta] Improve geo restricted videos extraction * [ard:beta] Fix subtitles extraction * [ard:beta] Improve extraction robustness * [ard:beta] Relax URL regular expression (#18441) * [acast] Add support for embed.acast.com and play.acast.com (#18483) * [iprima] Relax URL regular expression (#18515, #18540) * [vrv] Fix initial state extraction (#18553) * [youtube] Fix mark watched (#18546) + [safari] Add support for learning.oreilly.com (#18510) * [youtube] Fix multifeed extraction (#18531) * [lecturio] Improve subtitles extraction (#18488) * [uol] Fix format URL extraction (#18480) + [ard:mediathek] Add support for classic.ardmediathek.de (#18473) version 2018.12.09 Core * [YoutubeDL] Keep session cookies 
in cookie file between runs * [YoutubeDL] Recognize session cookies with expired set to 0 (#12929) Extractors + [teachable] Add support for teachable platform sites (#5451, #18150, #18272) + [aenetworks] Add support for historyvault.com (#18460) * [imgur] Improve gallery and album detection and extraction (#9133, #16577, #17223, #18404) * [iprima] Relax URL regular expression (#18453) * [hotstar] Fix video data extraction (#18386) * [ard:mediathek] Fix title and description extraction (#18349, #18371) * [xvideos] Switch to HTTPS (#18422, #18427) + [lecturio] Add support for lecturio.com (#18405) + [nrktv:series] Add support for extra materials * [nrktv:season,series] Fix extraction (#17159, #17258) * [nrktv] Relax URL regular expression (#18304, #18387) * [yourporn] Fix extraction (#18424, #18425) * [tbs] Fix info extraction (#18403) + [gamespot] Add support for review URLs version 2018.12.03 Core * [utils] Fix random_birthday to generate existing dates only (#18284) Extractors + [tiktok] Add support for tiktok.com (#18108, #18135) * [pornhub] Use actual URL host for requests (#18359) * [lynda] Fix authentication (#18158, #18217) * [gfycat] Update API endpoint (#18333, #18343) + [hotstar] Add support for alternative app state layout (#18320) * [azmedien] Fix extraction (#18334, #18336) + [vimeo] Add support for VHX (Vimeo OTT) (#14835) * [joj] Fix extraction (#18280, #18281) + [wistia] Add support for fast.wistia.com (#18287) version 2018.11.23 Core + [setup.py] Add more relevant classifiers Extractors * [mixcloud] Fallback to hardcoded decryption key (#18016) * [nbc:news] Fix article extraction (#16194) * [foxsports] Fix extraction (#17543) * [loc] Relax regular expression and improve formats extraction + [ciscolive] Add support for ciscolive.cisco.com (#17984) * [nzz] Relax kaltura regex (#18228) * [sixplay] Fix formats extraction * [bitchute] Improve title extraction * [kaltura] Limit requested MediaEntry fields + [americastestkitchen] Add support for zype 
embeds (#18225) + [pornhub] Add pornhub.net alias * [nova:embed] Fix extraction (#18222) version 2018.11.18 Extractors + [wwe] Extract subtitles + [wwe] Add support for playlists (#14781) + [wwe] Add support for wwe.com (#14781, #17450) * [vk] Detect geo restriction (#17767) * [openload] Use original host during extraction (#18211) * [atvat] Fix extraction (#18041) + [rte] Add support for new API endpoint (#18206) * [tnaflixnetwork:embed] Fix extraction (#18205) * [picarto] Use API and add token support (#16518) + [zype] Add support for player.zype.com (#18143) * [vivo] Fix extraction (#18139) * [ruutu] Update API endpoint (#18138) version 2018.11.07 Extractors + [youtube] Add another JS signature function name regex (#18091, #18093, #18094) * [facebook] Fix tahoe request (#17171) * [cliphunter] Fix extraction (#18083) + [youtube:playlist] Add support for invidio.us (#18077) * [zattoo] Arrange API hosts for derived extractors (#18035) + [youtube] Add fallback metadata extraction from videoDetails (#18052) version 2018.11.03 Core * [extractor/common] Ensure response handle is not prematurely closed before it can be read if it matches expected_status (#17195, #17846, #17447) Extractors * [laola1tv:embed] Set correct stream access URL scheme (#16341) + [ehftv] Add support for ehftv.com (#15408) * [azmedien] Adopt to major site redesign (#17745, #17746) + [twitcasting] Add support for twitcasting.tv (#17981) * [orf:tvthek] Fix extraction (#17737, #17956, #18024) + [openload] Add support for oload.fun (#18045) * [njpwworld] Fix authentication (#17427) + [linkedin:learning] Add support for linkedin.com/learning (#13545) * [theplatform] Improve error detection (#13222) * [cnbc] Simplify extraction (#14280, #17110) + [cbnc] Add support for new URL schema (#14193) * [aparat] Improve extraction and extract more metadata (#17445, #18008) * [aparat] Fix extraction version 2018.10.29 Core + [extractor/common] Add validation for JSON-LD URLs Extractors + [sportbox] Add support 
for matchtv.ru * [sportbox] Fix extraction (#17978) * [screencast] Fix extraction (#14590, #14617, #17990) + [openload] Add support for oload.icu + [ivi] Add support for ivi.tv * [crunchyroll] Improve extraction failsafeness (#17991) * [dailymail] Fix formats extraction (#17976) * [viewster] Reduce format requests * [cwtv] Handle API errors (#17905) + [rutube] Use geo verification headers (#17897) + [brightcove:legacy] Add fallbacks to brightcove:new (#13912) - [tv3] Remove extractor (#10461, #15339) * [ted] Fix extraction for HTTP and RTMP formats (#5941, #17572, #17894) + [openload] Add support for oload.cc (#17823) + [patreon] Extract post_file URL (#17792) * [patreon] Fix extraction (#14502, #10471) version 2018.10.05 Extractors * [pluralsight] Improve authentication (#17762) * [dailymotion] Fix extraction (#17699) * [crunchyroll] Switch to HTTPS for RpcApi (#17749) + [philharmoniedeparis] Add support for pad.philharmoniedeparis.fr (#17705) * [philharmoniedeparis] Fix extraction (#17705) + [jamendo] Add support for licensing.jamendo.com (#17724) + [openload] Add support for oload.cloud (#17710) * [pluralsight] Fix subtitles extraction (#17726, #17728) + [vimeo] Add another config regular expression (#17690) * [spike] Fix Paramount Network extraction (#17677) * [hotstar] Fix extraction (#14694, #14931, #17637) version 2018.09.26 Extractors * [pluralsight] Fix subtitles extraction (#17671) * [mediaset] Improve embed support (#17668) + [youtube] Add support for invidio.us (#17613) + [zattoo] Add support for more zattoo platform sites * [zattoo] Fix extraction (#17175, #17542) version 2018.09.18 Core + [extractor/common] Introduce channel meta fields Extractors * [adobepass] Don't pollute default headers dict * [udemy] Don't pollute default headers dict * [twitch] Don't pollute default headers dict * [youtube] Don't pollute default query dict (#17593) * [crunchyroll] Prefer hardsubless formats and formats in locale language * [vrv] Make format ids deterministic * 
[vimeo] Fix ondemand playlist extraction (#14591) + [pornhub] Extract upload date (#17574) + [porntube] Extract channel meta fields + [vimeo] Extract channel meta fields + [youtube] Extract channel meta fields (#9676, #12939) * [porntube] Fix extraction (#17541) * [asiancrush] Fix extraction (#15630) + [twitch:clips] Extend URL regular expression (#17559) + [vzaar] Add support for HLS * [tube8] Fix metadata extraction (#17520) * [eporner] Extract JSON-LD (#17519) version 2018.09.10 Core + [utils] Properly recognize AV1 codec (#17506) Extractors + [iprima] Add support for prima.iprima.cz (#17514) + [tele5] Add support for tele5.de (#7805, #7922, #17331, #17414) * [nbc] Fix extraction of percent encoded URLs (#17374) version 2018.09.08 Extractors * [youtube] Fix extraction (#17457, #17464) + [pornhub:uservideos] Add support for new URLs (#17388) * [iprima] Confirm adult check (#17437) * [slideslive] Make check for video service name case-insensitive (#17429) * [radiojavan] Fix extraction (#17151) * [generic] Skip unsuccessful jwplayer extraction (#16735) version 2018.09.01 Core * [utils] Skip remote IP addresses non matching to source address' IP version when creating a connection (#13422, #17362) Extractors + [ard] Add support for one.ard.de (#17397) * [niconico] Fix extraction on python3 (#17393, #17407) * [ard] Extract f4m formats * [crunchyroll] Parse vilos media data (#17343) + [ard] Add support for Beta ARD Mediathek + [bandcamp] Extract more metadata (#13197) * [internazionale] Fix extraction of non-available-abroad videos (#17386) version 2018.08.28 Extractors + [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix) (#17361) * [bitchute] Fix extraction by pass custom User-Agent (#17360) * [webofstories:playlist] Fix extraction (#16914) + [tvplayhome] Add support for new tvplay URLs (#17344) + [generic] Allow relative src for videojs embeds (#17324) + [xfileshare] Add support for vidto.se (#17317) + [vidzi] Add support for vidzi.nu (#17316) 
+ [nova:embed] Add support for media.cms.nova.cz (#17282) version 2018.08.22 Core * [utils] Use pure browser header for User-Agent (#17236) Extractors + [kinopoisk] Add support for kinopoisk.ru (#17283) + [yourporn] Add support for yourporn.sexy (#17298) + [go] Add support for disneynow.go.com (#16299, #17264) + [6play] Add support for play.rtl.hr (#17249) * [anvato] Fallback to generic API key for access-key-to-API-key lookup (#16788, #17254) * [lci] Fix extraction (#17274) * [bbccouk] Extend id URL regular expression (#17270) * [cwtv] Fix extraction (#17256) * [nova] Fix extraction (#17241) + [generic] Add support for expressen embeds * [raywenderlich] Adapt to site redesign (#17225) + [redbulltv] Add support redbull.com tv URLs (#17218) + [bitchute] Add support for bitchute.com (#14052) + [clyp] Add support for token protected media (#17184) * [imdb] Fix extension extraction (#17167) version 2018.08.04 Extractors * [funk:channel] Improve byChannelAlias extraction (#17142) * [twitch] Fix authentication (#17024, #17126) * [twitch:vod] Improve URL regular expression (#17135) * [watchbox] Fix extraction (#17107) * [pbs] Fix extraction (#17109) * [theplatform] Relax URL regular expression (#16181, #17097) + [viqeo] Add support for viqeo.tv (#17066) version 2018.07.29 Extractors * [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076) + [pornhub] Add support for subtitles (#16924, #17088) * [ceskatelevize] Use https for API call (#16997, #16999) * [dailymotion:playlist] Fix extraction (#16894) * [ted] Improve extraction * [ted] Fix extraction for videos without nativeDownloads (#16756, #17085) * [telecinco] Fix extraction (#17080) * [mitele] Reduce number of requests * [rai] Return non HTTP relinker URL intact (#17055) * [vk] Fix extraction for inline only videos (#16923) * [streamcloud] Fix extraction (#17054) * [facebook] Fix tahoe player extraction with authentication (#16655) + [puhutv] Add support for puhutv.com (#12712, #16010, #16269) version 
2018.07.21 Core + [utils] Introduce url_or_none * [utils] Allow JSONP without function name (#17028) + [extractor/common] Extract DASH and MSS formats from SMIL manifests Extractors + [bbc] Add support for BBC Radio Play pages (#17022) * [iwara] Fix download URLs (#17026) * [vrtnu] Relax title extraction and extract JSON-LD (#17018) + [viu] Pass Referer and Origin headers and area id (#16992) + [vimeo] Add another config regular expression (#17013) + [facebook] Extract view count (#16942) * [dailymotion] Improve description extraction (#16984) * [slutload] Fix and improve extraction (#17001) * [mediaset] Fix extraction (#16977) + [theplatform] Add support for theplatform TLD customization (#16977) * [imgur] Relax URL regular expression (#16987) * [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262, #16959) version 2018.07.10 Core * [utils] Share JSON-LD regular expression * [downloader/dash] Improve error handling (#16927) Extractors + [nrktv] Add support for new season and serie URL schema + [nrktv] Add support for new episode URL schema (#16909) + [frontendmasters] Add support for frontendmasters.com (#3661, #16328) * [funk] Fix extraction (#16918) * [watchbox] Fix extraction (#16904) * [dplayit] Sort formats * [dplayit] Fix extraction (#16901) * [youtube] Improve login error handling (#13822) version 2018.07.04 Core * [extractor/common] Properly escape % in MPD templates (#16867) * [extractor/common] Use source URL as Referer for HTML5 entries (16849) * Prefer ffmpeg over avconv by default (#8622) Extractors * [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899) * [lynda] Simplify login and improve error capturing (#16891) + [go90] Add support for embed URLs (#16873) * [go90] Detect geo restriction error and pass geo verification headers (#16874) * [vlive] Fix live streams extraction (#16871) * [npo] Fix typo (#16872) + [mediaset] Add support for new videos and extract all formats (#16568) * [dctptv] Restore extraction based 
on REST API (#16850) * [svt] Improve extraction and add support for pages (#16802) * [porncom] Fix extraction (#16808) version 2018.06.25 Extractors * [joj] Relax URL regular expression (#16771) * [brightcove] Workaround sonyliv DRM protected videos (#16807) * [motherless] Fix extraction (#16786) * [itv] Make SOAP request non fatal and extract metadata from webpage (#16780) - [foxnews:insider] Remove extractor (#15810) + [foxnews] Add support for iframe embeds (#15810, #16711) version 2018.06.19 Core + [extractor/common] Introduce expected_status in _download_* methods for convenient accept of HTTP requests failed with non 2xx status codes + [compat] Introduce compat_integer_types Extractors * [peertube] Improve generic support (#16733) + [6play] Use geo verification headers * [rtbf] Fix extraction for python 3.2 * [vgtv] Improve HLS formats extraction + [vgtv] Add support for www.aftonbladet.se/tv URLs * [bbccouk] Use expected_status * [markiza] Expect 500 HTTP status code * [tvnow] Try all clear manifest URLs (#15361) version 2018.06.18 Core * [downloader/rtmp] Fix downloading in verbose mode (#16736) Extractors + [markiza] Add support for markiza.sk (#16750) * [wat] Try all supported adaptive URLs + [6play] Add support for rtlplay.be and extract hd usp formats + [rtbf] Add support for audio and live streams (#9638, #11923) + [rtbf] Extract HLS, DASH and all HTTP formats + [rtbf] Extract subtitles + [rtbf] Fixup specific HTTP URLs (#16101) + [expressen] Add support for expressen.se * [vidzi] Fix extraction (#16678) * [pbs] Improve extraction (#16623, #16684) * [bilibili] Restrict cid regular expression (#16638, #16734) version 2018.06.14 Core * [downloader/http] Fix retry on error when streaming to stdout (#16699) Extractors + [discoverynetworks] Add support for disco-api videos (#16724) + [dailymotion] Add support for password protected videos (#9789) + [abc:iview] Add support for livestreams (#12354) * [abc:iview] Fix extraction (#16704) + [crackle] Add support 
for sonycrackle.com (#16698) + [tvnet] Add support for tvnet.gov.vn (#15462) * [nrk] Update API hosts and try all previously known ones (#16690) * [wimp] Fix Youtube embeds extraction version 2018.06.11 Extractors * [npo] Extend URL regular expression and add support for npostart.nl (#16682) + [inc] Add support for another embed schema (#16666) * [tv4] Fix format extraction (#16650) + [nexx] Add support for free cdn (#16538) + [pbs] Add another cove id pattern (#15373) + [rbmaradio] Add support for 192k format (#16631) version 2018.06.04 Extractors + [camtube] Add support for camtube.co + [twitter:card] Extract guest token (#16609) + [chaturbate] Use geo verification headers + [bbc] Add support for bbcthree (#16612) * [youtube] Move metadata extraction after video availability check + [youtube] Extract track and artist + [safari] Add support for new URL schema (#16614) * [adn] Fix extraction version 2018.06.02 Core * [utils] Improve determine_ext Extractors + [facebook] Add support for tahoe player videos (#15441, #16554) * [cbc] Improve extraction (#16583, #16593) * [openload] Improve ext extraction (#16595) + [twitter:card] Add support for another endpoint (#16586) + [openload] Add support for oload.win and oload.download (#16592) * [audimedia] Fix extraction (#15309) + [francetv] Add support for sport.francetvinfo.fr (#15645) * [mlb] Improve extraction (#16587) - [nhl] Remove old extractors * [rbmaradio] Check formats availability (#16585) version 2018.05.30 Core * [downloader/rtmp] Generalize download messages and report time elapsed on finish * [downloader/rtmp] Gracefully handle live streams interrupted by user Extractors * [teamcoco] Fix extraction for full episodes (#16573) * [spiegel] Fix info extraction (#16538) + [apa] Add support for apa.at (#15041, #15672) + [bellmedia] Add support for bnnbloomberg.ca (#16560) + [9c9media] Extract MPD formats and subtitles * [cammodels] Use geo verification headers + [ufctv] Add support for authentication (#16542) + 
* [go90] Extract age limit and detect DRM protection (#10127) * [viewlift] Fix extraction for snagfilms.com (#15766)
d.tube (#15201) * [options] Fix typo (#16450) * [youtube] Improve format filesize extraction (#16453) * [youtube] Make uploader extraction non fatal (#16444) * [youtube] Fix extraction for embed restricted live streams (#16433) * [nbc] Improve info extraction (#16440) * [twitch:clips] Fix extraction (#16429) * [redditr] Relax URL regular expression (#16426, #16427) * [mixcloud] Bypass throttling for HTTP formats (#12579, #16424) + [nick] Add support for nickjr.de (#13230) * [teamcoco] Fix extraction (#16374) version 2018.05.09 Core * [YoutubeDL] Ensure ext exists for automatic captions * Introduce --geo-bypass-ip-block Extractors + [udemy] Extract asset captions + [udemy] Extract stream URLs (#16372) + [businessinsider] Add support for businessinsider.com (#16387, #16388, #16389) + [cloudflarestream] Add support for cloudflarestream.com (#16375) * [watchbox] Fix extraction (#16356) * [discovery] Extract Affiliate/Anonymous Auth Token from cookies (#14954) + [itv:btcc] Add support for itv.com/btcc (#16139) * [tunein] Use live title for live streams (#16347) * [itv] Improve extraction (#16253) version 2018.05.01 Core * [downloader/fragment] Restart download if .ytdl file is corrupt (#16312) + [extractor/common] Extract interaction statistic + [utils] Add merge_dicts + [extractor/common] Add _download_json_handle Extractors * [kaltura] Improve iframe embeds detection (#16337) + [udemy] Extract outputs renditions (#16289, #16291, #16320, #16321, #16334, #16335) + [zattoo] Add support for zattoo.com and mobiltv.quickline.com (#14668, #14676) * [yandexmusic] Convert release_year to int * [udemy] Override _download_webpage_handle instead of _download_webpage * [xiami] Override _download_webpage_handle instead of _download_webpage * [yandexmusic] Override _download_webpage_handle instead of _download_webpage * [youtube] Correctly disable polymer on all requests (#16323, #16326) * [generic] Prefer enclosures over links in RSS feeds (#16189) + [redditr] Add support for 
old.reddit.com URLs (#16274) * [nrktv] Update API host (#16324) + [imdb] Extract all formats (#16249) + [vimeo] Extract JSON-LD (#16295) * [funk:channel] Improve extraction (#16285) version 2018.04.25 Core * [utils] Fix match_str for boolean meta fields + [Makefile] Add support for pandoc 2 and disable smart extension (#16251) * [YoutubeDL] Fix typo in media extension compatibility checker (#16215) Extractors + [openload] Recognize IPv6 stream URLs (#16136, #16137, #16205, #16246, #16250) + [twitch] Extract is_live according to status (#16259) * [pornflip] Relax URL regular expression (#16258) - [etonline] Remove extractor (#16256) * [breakcom] Fix extraction (#16254) + [youtube] Add ability to authenticate with cookies * [youtube:feed] Implement lazy playlist extraction (#10184) + [svt] Add support for TV channel live streams (#15279, #15809) * [ccma] Fix video extraction (#15931) * [rentv] Fix extraction (#15227) + [nick] Add support for nickjr.nl (#16230) * [extremetube] Fix metadata extraction + [keezmovies] Add support for generic embeds (#16134, #16154) * [nexx] Extract new azure URLs (#16223) * [cbssports] Fix extraction (#16217) * [kaltura] Improve embeds detection (#16201) * [instagram:user] Fix extraction (#16119) * [cbs] Skip DRM asset types (#16104) version 2018.04.16 Extractors * [smotri:broadcast] Fix extraction (#16180) + [picarto] Add support for picarto.tv (#6205, #12514, #15276, #15551) * [vine:user] Fix extraction (#15514, #16190) * [pornhub] Relax URL regular expression (#16165) * [cbc:watch] Re-acquire device token when expired (#16160) + [fxnetworks] Add support for https theplatform URLs (#16125, #16157) + [instagram:user] Add request signing (#16119) + [twitch] Add support for mobile URLs (#16146) version 2018.04.09 Core * [YoutubeDL] Do not save/restore console title while simulate (#16103) * [extractor/common] Relax JSON-LD context check (#16006) Extractors + [generic] Add support for tube8 embeds + [generic] Add support for 
share-videos.se embeds (#16089, #16115) * [odnoklassniki] Extend URL regular expression (#16081) * [steam] Bypass mature content check (#16113) + [acast] Extract more metadata * [acast] Fix extraction (#16118) * [instagram:user] Fix extraction (#16119) * [drtuber] Fix title extraction (#16107, #16108) * [liveleak] Extend URL regular expression (#16117) + [openload] Add support for oload.xyz * [openload] Relax stream URL regular expression * [openload] Fix extraction (#16099) + [svtplay:series] Add support for season URLs + [svtplay:series] Add support for series (#11130, #16059) version 2018.04.03 Extractors + [tvnow] Add support for shows (#15837) * [dramafever] Fix authentication (#16067) * [afreecatv] Use partial view only when necessary (#14450) + [afreecatv] Add support for authentication (#14450) + [nationalgeographic] Add support for new URL schema (#16001, #16054) * [xvideos] Fix thumbnail extraction (#15978, #15979) * [medialaan] Fix vod id (#16038) + [openload] Add support for oload.site (#16039) * [naver] Fix extraction (#16029) * [dramafever] Partially switch to API v5 (#16026) * [abc:iview] Unescape title and series meta fields (#15994) * [videa] Extend URL regular expression (#16003) version 2018.03.26.1 Core + [downloader/external] Add elapsed time to progress hook (#10876) * [downloader/external,fragment] Fix download finalization when writing file to stdout (#10809, #10876, #15799) Extractors * [vrv] Fix extraction on python2 (#15928) * [afreecatv] Update referrer (#15947) + [24video] Add support for 24video.sexy (#15973) * [crackle] Bypass geo restriction * [crackle] Fix extraction (#15969) + [lenta] Add support for lenta.ru (#15953) + [instagram:user] Add pagination (#15934) * [youku] Update ccode (#15939) * [libsyn] Adapt to new page structure version 2018.03.20 Core * [extractor/common] Improve thumbnail extraction for HTML5 entries * Generalize XML manifest processing code and improve XSPF parsing + [extractor/common] Add _download_xml_handle 
+ [extractor/common] Add support for relative URIs in _parse_xspf (#15794) Extractors + [7plus] Extract series metadata (#15862, #15906) * [9now] Bypass geo restriction (#15920) * [cbs] Skip unavailable assets (#13490, #13506, #15776) + [canalc2] Add support for HTML5 videos (#15916, #15919) + [ceskatelevize] Add support for iframe embeds (#15918) + [prosiebensat1] Add support for galileo.tv (#15894) + [generic] Add support for xfileshare embeds (#15879) * [bilibili] Switch to v2 playurl API * [bilibili] Fix and improve extraction (#15048, #15430, #15622, #15863) * [heise] Improve extraction (#15496, #15784, #15026) * [instagram] Fix user videos extraction (#15858) version 2018.03.14 Extractors * [soundcloud] Update client id (#15866) + [tennistv] Add support for tennistv.com + [line] Add support for tv.line.me (#9427) * [xnxx] Fix extraction (#15817) * [njpwworld] Fix authentication (#15815) version 2018.03.10 Core * [downloader/hls] Skip uplynk ad fragments (#15748) Extractors * [pornhub] Don't override session cookies (#15697) + [raywenderlich] Add support for videos.raywenderlich.com (#15251) * [funk] Fix extraction and rework extractors (#15792) * [nexx] Restore reverse engineered approach + [heise] Add support for kaltura embeds (#14961, #15728) + [tvnow] Extract series metadata (#15774) * [ruutu] Continue formats extraction on NOT-USED URLs (#15775) * [vrtnu] Use redirect URL for building video JSON URL (#15767, #15769) * [vimeo] Modernize login code and improve error messaging * [archiveorg] Fix extraction (#15770, #15772) + [hidive] Add support for hidive.com (#15494) * [afreecatv] Detect deleted videos * [afreecatv] Fix extraction (#15755) * [vice] Fix extraction and rework extractors (#11101, #13019, #13622, #13778) + [vidzi] Add support for vidzi.si (#15751) * [npo] Fix typo version 2018.03.03 Core + [utils] Add parse_resolution Revert respect --prefer-insecure while updating Extractors + [yapfiles] Add support for yapfiles.ru (#15726, #11085) * 
[spankbang] Fix formats extraction (#15727) * [adn] Fix extraction (#15716) + [toggle] Extract DASH and ISM formats (#15721) + [nickelodeon] Add support for nickelodeon.com.tr (#15706) * [npo] Validate and filter format URLs (#15709) version 2018.02.26 Extractors * [udemy] Use custom User-Agent (#15571) version 2018.02.25 Core * [postprocessor/embedthumbnail] Skip embedding when there aren't any thumbnails (#12573) * [extractor/common] Improve jwplayer subtitles extraction (#15695) Extractors + [vidlii] Add support for vidlii.com (#14472, #14512, #14779) + [streamango] Capture and output error messages * [streamango] Fix extraction (#14160, #14256) + [telequebec] Add support for emissions (#14649, #14655) + [telequebec:live] Add support for live streams (#15688) + [mailru:music] Add support for mail.ru/music (#15618) * [aenetworks] Switch to akamai HLS formats (#15612) * [ytsearch] Fix flat title extraction (#11260, #15681) version 2018.02.22 Core + [utils] Fixup some common URL typos in sanitize_url (#15649) * Respect --prefer-insecure while updating (#15497) Extractors * [vidio] Fix HLS URL extraction (#15675) + [nexx] Add support for arc.nexx.cloud URLs * [nexx] Switch to arc API (#15652) * [redtube] Fix duration extraction (#15659) + [sonyliv] Respect referrer (#15648) + [brightcove:new] Use referrer for formats' HTTP headers + [cbc] Add support for olympics.cbc.ca (#15535) + [fusion] Add support for fusion.tv (#15628) * [npo] Improve quality metadata extraction * [npo] Relax URL regular expression (#14987, #14994) + [npo] Capture and output error message + [pornhub] Add support for channels (#15613) * [youtube] Handle shared URLs with generic extractor (#14303) version 2018.02.11 Core + [YoutubeDL] Add support for filesize_approx in format selector (#15550) Extractors + [francetv] Add support for live streams (#13689) + [francetv] Add support for zouzous.fr and ludo.fr (#10454, #13087, #13103, #15012) * [francetv] Separate main extractor and rework others to 
* [6play] Do not modify asset URLs with a token (#15248)
Relax URL regular expression (#15458) * [cbsinteractive] Fix data extraction (#15451) + [amcnetworks] Add support for sundancetv.com (#9260) version 2018.01.27 Core * [extractor/common] Improve _json_ld for articles * Switch codebase to use compat_b64decode + [compat] Add compat_b64decode Extractors + [seznamzpravy] Add support for seznam.cz and seznamzpravy.cz (#14102, #14616) * [dplay] Bypass geo restriction + [dplay] Add support for disco-api videos (#15396) * [youtube] Extract precise error messages (#15284) * [teachertube] Capture and output error message * [teachertube] Fix and relax thumbnail extraction (#15403) + [prosiebensat1] Add another clip id regular expression (#15378) * [tbs] Update tokenizer url (#15395) * [mixcloud] Use compat_b64decode (#15394) - [thesixtyone] Remove extractor (#15341) version 2018.01.21 Core * [extractor/common] Improve jwplayer DASH formats extraction (#9242, #15187) * [utils] Improve scientific notation handling in js_to_json (#14789) Extractors + [southparkdk] Add support for southparkstudios.nu + [southpark] Add support for collections (#14803) * [franceinter] Fix upload date extraction (#14996) + [rtvs] Add support for rtvs.sk (#9242, #15187) * [restudy] Fix extraction and extend URL regular expression (#15347) * [youtube:live] Improve live detection (#15365) + [springboardplatform] Add support for springboardplatform.com * [prosiebensat1] Add another clip id regular expression (#15290) - [ringtv] Remove extractor (#15345) version 2018.01.18 Extractors * [soundcloud] Update client id (#15306) - [kamcord] Remove extractor (#15322) + [spiegel] Add support for nexx videos (#15285) * [twitch] Fix authentication and error capture (#14090, #15264) * [vk] Detect more errors due to copyright complaints (#15259) version 2018.01.14 Extractors * [youtube] Fix live streams extraction (#15202) * [wdr] Bypass geo restriction * [wdr] Rework extractors (#14598) + [wdr] Add support for wdrmaus.de/elefantenseite (#14598) + [gamestar] Add 
support for gamepro.de (#3384) * [viafree] Skip rtmp formats (#15232) + [pandoratv] Add support for mobile URLs (#12441) + [pandoratv] Add support for new URL format (#15131) + [ximalaya] Add support for ximalaya.com (#14687) + [digg] Add support for digg.com (#15214) * [limelight] Tolerate empty pc formats (#15150, #15151, #15207) * [ndr:embed:base] Make separate formats extraction non fatal (#15203) + [weibo] Add extractor (#15079) + [ok] Add support for live streams * [canalplus] Fix extraction (#15072) * [bilibili] Fix extraction (#15188) version 2018.01.07 Core * [utils] Fix youtube-dl under PyPy3 on Windows * [YoutubeDL] Output python implementation in debug header Extractors + [jwplatform] Add support for multiple embeds (#15192) * [mitele] Fix extraction (#15186) + [motherless] Add support for groups (#15124) * [lynda] Relax URL regular expression (#15185) * [soundcloud] Fallback to avatar picture for thumbnail (#12878) * [youku] Fix list extraction (#15135) * [openload] Fix extraction (#15166) * [lynda] Skip invalid subtitles (#15159) * [twitch] Pass video id to url_result when extracting playlist (#15139) * [rtve.es:alacarta] Fix extraction of some new URLs * [acast] Fix extraction (#15147) version 2017.12.31 Core + [extractor/common] Add container meta field for formats extracted in _parse_mpd_formats (#13616) + [downloader/hls] Use HTTP headers for key request * [common] Use AACL as the default fourcc when AudioTag is 255 * [extractor/common] Fix extraction of DASH formats with the same representation id (#15111) Extractors + [slutload] Add support for mobile URLs (#14806) * [abc:iview] Bypass geo restriction * [abc:iview] Fix extraction (#14711, #14782, #14838, #14917, #14963, #14985, #15035, #15057, #15061, #15071, #15095, #15106) * [openload] Fix extraction (#15118) - [sandia] Remove extractor - [collegerama] Remove extractor + [mediasite] Add support for sites based on Mediasite Video Platform (#5428, #11185, #14343) + [ufctv] Add support for ufc.tv 
(#14520) * [pluralsight] Fix missing first line of subtitles (#11118) * [openload] Fallback on f-page extraction (#14665, #14879) * [vimeo] Improve password protected videos extraction (#15114) * [aws] Fix canonical/signed headers generation on python 2 (#15102) version 2017.12.28 Extractors + [internazionale] Add support for internazionale.it (#14973) * [playtvak] Relax video regular expression and make description optional (#15037) + [filmweb] Add support for filmweb.no (#8773, #10368) + [23video] Add support for 23video.com + [espn] Add support for fivethirtyeight.com (#6864) + [umg:de] Add support for universal-music.de (#11582, #11584) + [espn] Add support for espnfc and extract more formats (#8053) * [youku] Update ccode (#14880) + [openload] Add support for oload.stream (#15070) * [youku] Fix list extraction (#15065) version 2017.12.23 Core * [extractor/common] Move X-Forwarded-For setup code into _request_webpage + [YoutubeDL] Add support for playlist_uploader and playlist_uploader_id in output template (#11427, #15018) + [extractor/common] Introduce uploader, uploader_id and uploader_url meta fields for playlists (#11427, #15018) * [downloader/fragment] Encode filename of fragment being removed (#15020) + [utils] Add another date format pattern (#14999) Extractors + [kaltura] Add another embed pattern for entry_id + [7plus] Add support for 7plus.com.au (#15043) * [animeondemand] Relax login error regular expression + [shahid] Add support for show pages (#7401) + [youtube] Extract uploader, uploader_id and uploader_url for playlists (#11427, #15018) * [afreecatv] Improve format extraction (#15019) + [cspan] Add support for audio only pages and catch page errors (#14995) + [mailru] Add support for embed URLs (#14904) * [crunchyroll] Future-proof XML element checks (#15013) * [cbslocal] Fix timestamp extraction (#14999, #15000) * [discoverygo] Correct TTML subtitle extension * [vk] Make view count optional (#14979) * [disney] Skip Apple FairPlay formats 
(#14982) * [voot] Fix format extraction (#14758) version 2017.12.14 Core * [postprocessor/xattr] Clarify NO_SPACE message (#14970) * [downloader/http] Return actual download result from real_download (#14971) Extractors + [itv] Extract more subtitles and duration * [itv] Improve extraction (#14944) + [byutv] Add support for geo restricted videos * [byutv] Fix extraction (#14966, #14967) + [bbccouk] Fix extraction for 320k HLS streams + [toutv] Add support for special video URLs (#14179) * [discovery] Fix free videos extraction (#14157, #14954) * [tvnow] Fix extraction (#7831) + [nickelodeon:br] Add support for nickelodeon brazil websites (#14893) * [nick] Improve extraction (#14876) * [tbs] Fix extraction (#13658) version 2017.12.10 Core + [utils] Add sami mimetype to mimetype2ext Extractors * [culturebox] Improve video id extraction (#14947) * [twitter] Improve extraction (#14197) + [udemy] Extract more HLS formats * [udemy] Improve course id extraction (#14938) + [stretchinternet] Add support for portal.stretchinternet.com (#14576) * [ellentube] Fix extraction (#14407, #14570) + [raiplay:playlist] Add support for playlists (#14563) * [sonyliv] Bypass geo restriction * [sonyliv] Extract higher quality formats (#14922) * [fox] Extract subtitles + [fox] Add support for Adobe Pass authentication (#14205, #14489) - [dailymotion:cloud] Remove extractor (#6794) * [xhamster] Fix thumbnail extraction (#14780) + [xhamster] Add support for mobile URLs (#14780) * [generic] Don't pass video id as mpd id while extracting DASH (#14902) * [ard] Skip invalid stream URLs (#14906) * [porncom] Fix metadata extraction (#14911) * [pluralsight] Detect agreement request (#14913) * [toutv] Fix login (#14614) version 2017.12.02 Core + [downloader/fragment] Commit part file after each fragment + [extractor/common] Add durations for DASH fragments with bare SegmentURLs + [extractor/common] Add support for DASH manifests with SegmentLists with bare SegmentURLs (#14844) + [utils] Add hvc1 
codec code to parse_codecs Extractors * [xhamster] Fix extraction (#14884) * [youku] Update ccode (#14872) * [mnet] Fix format extraction (#14883) + [xiami] Add Referer header to API request * [mtv] Correct scc extension in extracted subtitles (#13730) * [vvvvid] Fix extraction for kenc videos (#13406) + [br] Add support for BR Mediathek videos (#14560, #14788) + [daisuki] Add support for motto.daisuki.com (#14681) * [odnoklassniki] Fix API metadata request (#14862) * [itv] Fix HLS formats extraction + [pbs] Add another media id regular expression version 2017.11.26 Core * [extractor/common] Use final URL when dumping request (#14769) Extractors * [fczenit] Fix extraction - [firstpost] Remove extractor * [freespeech] Fix extraction * [nexx] Extract more formats + [openload] Add support for openload.link (#14763) * [empflix] Relax URL regular expression * [empflix] Fix extraction * [tnaflix] Don't modify download URLs (#14811) - [gamersyde] Remove extractor * [francetv:generationwhat] Fix extraction + [massengeschmacktv] Add support for Massengeschmack TV * [fox9] Fix extraction * [faz] Fix extraction and add support for Perform Group embeds (#14714) + [performgroup] Add support for performgroup.com + [jwplatform] Add support for iframes (#14828) * [culturebox] Fix extraction (#14827) * [youku] Fix extraction; update ccode (#14815) * [livestream] Make SMIL extraction non fatal (#14792) + [drtuber] Add support for mobile URLs (#14772) + [spankbang] Add support for mobile URLs (#14771) * [instagram] Fix description, timestamp and counters extraction (#14755) version 2017.11.15 Core * [common] Skip Apple FairPlay m3u8 manifests (#14741) * [YoutubeDL] Fix playlist range optimization for --playlist-items (#14740) Extractors * [vshare] Capture and output error message * [vshare] Fix extraction (#14473) * [crunchyroll] Extract old RTMP formats * [tva] Fix extraction (#14736) * [gamespot] Lower preference of HTTP formats (#14652) * [instagram:user] Fix extraction (#14699) * 
* [extractor/common] Respect URL query in _extract_wowza_formats (#14645) Extractors + [hotstar:playlist] Add support for playlists (#12465) * [hotstar] Bypass geo restriction (#14672) - [22tracks] Remove extractor (#11024, #14628) + [skysport] Add support for ooyala videos protected with embed_token (#14641)
on inconsistent download state * [downloader/hls] Fix total fragments count when ad fragments exist Extractors * [parliamentliveuk] Fix extraction (#14524) * [soundcloud] Update client id (#14546) + [servus] Add support for servus.com (#14362) + [unity] Add support for unity3d.com (#14528) * [youtube] Replace youtube redirect URLs in description (#14517) * [pbs] Restrict direct video URL regular expression (#14519) * [drtv] Respect preference for direct HTTP formats (#14509) + [eporner] Add support for embed URLs (#14507) * [arte] Capture and output error message * [niconico] Improve uploader metadata extraction robustness (#14135) version 2017.10.15.1 Core * [downloader/hls] Ignore anvato ad fragments (#14496) * [downloader/fragment] Output ad fragment count Extractors * [scrippsnetworks:watch] Bypass geo restriction + [anvato] Add ability to bypass geo restriction * [redditr] Fix extraction for URLs with query (#14495) version 2017.10.15 Core + [common] Add support for jwplayer youtube embeds Extractors * [scrippsnetworks:watch] Fix extraction (#14389) * [anvato] Process master m3u8 manifests * [youtube] Fix relative URLs in description * [spike] Bypass geo restriction + [howstuffworks] Add support for more domains * [infoq] Fix http format downloading + [rtlnl] Add support for another type of embeds + [onionstudios] Add support for bulbs-video embeds * [udn] Fix extraction * [shahid] Fix extraction (#14448) * [kaltura] Ignore Widevine encrypted video (.wvm) (#14471) * [vh1] Fix extraction (#9613) version 2017.10.12 Core * [YoutubeDL] Improve _default_format_spec (#14461) Extractors * [steam] Fix extraction (#14067) + [funk] Add support for funk.net (#14464) + [nexx] Add support for shortcuts and relax domain id extraction + [voxmedia] Add support for recode.net (#14173) + [once] Add support for vmap URLs + [generic] Add support for channel9 embeds (#14469) * [tva] Fix extraction (#14328) + [tubitv] Add support for new URL format (#14460) - [afreecatv:global] 
Remove extractor - [youtube:shared] Remove extractor (#14420)
[xhamsterembed] Fix extraction (#14308) version 2017.09.24 Core + [options] Accept lrc as a subtitle conversion target format (#14292) * [utils] Fix handling raw TTML subtitles (#14191) Extractors * [24video] Fix timestamp extraction and make non fatal (#14295) + [24video] Add support for 24video.adult (#14295) + [kakao] Add support for tv.kakao.com (#12298, #14007) + [twitter] Add support for URLs without user id (#14270) + [americastestkitchen] Add support for americastestkitchen.com (#10764, #13996) * [generic] Fix support for multiple HTML5 videos on one page (#14080) * [mixcloud] Fix extraction (#14088, #14132) + [lynda] Add support for educourse.ga (#14286) * [beeg] Fix extraction (#14275) * [nbcsports:vplayer] Correct theplatform URL (#13873) * [twitter] Fix duration extraction (#14141) * [tvplay] Bypass geo restriction + [heise] Add support for YouTube embeds (#14109) + [popcorntv] Add support for popcorntv.it (#5914, #14211) * [viki] Update app data (#14181) * [morningstar] Relax URL regular expression (#14222) * [openload] Fix extraction (#14225, #14257) * [noovo] Fix extraction (#14214) * [dailymotion:playlist] Relax URL regular expression (#14219) + [twitch] Add support for go.twitch.tv URLs (#14215) * [vgtv] Relax URL regular expression (#14223) version 2017.09.15 Core * [downloader/fragment] Restart inconsistent incomplete fragment downloads (#13731) * [YoutubeDL] Download raw subtitles files (#12909, #14191) Extractors * [condenast] Fix extraction (#14196, #14207) + [orf] Add support for f4m stories * [tv4] Relax URL regular expression (#14206) * [animeondemand] Bypass geo restriction + [animeondemand] Add support for flash videos (#9944) version 2017.09.11 Extractors * [rutube:playlist] Fix suitable (#14166) version 2017.09.10 Core + [utils] Introduce bool_or_none * [YoutubeDL] Ensure dir existence for each requested format (#14116) Extractors * [fox] Fix extraction (#14147) * [rutube] Use bool_or_none * [rutube] Rework and generalize playlist 
extractors (#13565) + [rutube:playlist] Add support for playlists (#13534, #13565) + [radiocanada] Add fallback for title extraction (#14145) * [vk] Use dedicated YouTube embeds extraction routine * [vice] Use dedicated YouTube embeds extraction routine * [cracked] Use dedicated YouTube embeds extraction routine * [chilloutzone] Use dedicated YouTube embeds extraction routine * [abcnews] Use dedicated YouTube embeds extraction routine * [youtube] Separate methods for embeds extraction * [redtube] Fix formats extraction (#14122) * [arte] Relax unavailability check (#14112) + [manyvids] Add support for preview videos from manyvids.com (#14053, #14059) * [vidme:user] Relax URL regular expression (#14054) * [bpb] Fix extraction (#14043, #14086) * [soundcloud] Fix download URL with private tracks (#14093) * [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707) * [viidea] Capture and output lecture error message (#14099) * [radiocanada] Skip unsupported platforms (#14100) version 2017.09.02 Extractors * [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, #14077, #14079, #14082, #14083, #14094, #14095, #14096) * [youtube] Fix upload date extraction (#14065) + [charlierose] Add support for episodes (#14062) + [bbccouk] Add support for w-prefixed ids (#14056) * [googledrive] Extend URL regular expression (#9785) + [googledrive] Add support for source format (#14046) * [pornhd] Fix extraction (#14005) version 2017.08.27.1 Extractors * [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037) version 2017.08.27 Core + [extractor/common] Extract height and format id for HTML5 videos (#14034) * [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023, #8625, #9483) * Simplify code and split into separate routines to facilitate maintaining * Make retry mechanism work on errors during actual download not only during connection establishment phase * Retry on ECONNRESET and ETIMEDOUT during reading data 
from network * Retry on content too short * Show error description on retry Extractors * [generic] Lower preference for extraction from LD-JSON * [rai] Fix audio formats extraction (#14024) * [youtube] Fix controversy videos extraction (#14027, #14029) * [mixcloud] Fix extraction (#14015, #14020) version 2017.08.23 Core + [extractor/common] Introduce _parse_xml * [extractor/common] Make HLS and DASH extraction in _parse_html5_media_entries non fatal (#13970) * [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) Extractors * [cbc:watch] Bypass geo restriction (#13993) * [toutv] Relax DRM check (#13994) + [googledrive] Add support for subtitles (#13619, #13638) * [pornhub] Relax uploader regular expression (#13906, #13975) * [bandcamp:album] Extract track titles (#13962) + [bbccouk] Add support for events URLs (#13893) + [liveleak] Support multi-video pages (#6542) + [liveleak] Support another liveleak embedding pattern (#13336) * [cda] Fix extraction (#13935) + [laola1tv] Add support for tv.ittf.com (#13965) * [mixcloud] Fix extraction (#13958, #13974, #13980, #14003) version 2017.08.18 Core * [YoutubeDL] Sanitize byte string format URLs (#13951) + [extractor/common] Add support for float durations in _parse_mpd_formats (#13919) Extractors * [arte] Detect unavailable videos (#13945) * [generic] Convert redirect URLs to unicode strings (#13951) * [udemy] Fix paid course detection (#13943) * [pluralsight] Use RPC API for course extraction (#13937) + [clippit] Add support for clippituser.tv + [qqmusic] Support new URL schemes (#13805) * [periscope] Renew HLS extraction (#13917) * [mixcloud] Extract decrypt key version 2017.08.13 Core * [YoutubeDL] Make sure format id is not empty * [extractor/common] Make _family_friendly_search optional * [extractor/common] Respect source's type attribute for HTML5 media (#13892) Extractors * [pornhub:playlistbase] Skip videos from drop-down menu (#12819, #13902) + [fourtube] Add support pornerbros.com (#6022) +
[fourtube] Add support porntube.com (#7859, #13901) + [fourtube] Add support fux.com * [limelight] Improve embeds detection (#13895) + [reddit] Add support for v.redd.it and reddit.com (#13847) * [aparat] Extract all formats (#13887) * [mixcloud] Fix play info decryption (#13885) + [generic] Add support for vzaar embeds (#13876) version 2017.08.09 Core * [utils] Skip missing params in cli_bool_option (#13865) Extractors * [xxxymovies] Fix title extraction (#13868) + [nick] Add support for nick.com.pl (#13860) * [mixcloud] Fix play info decryption (#13867) * [20min] Fix embeds extraction (#13852) * [dplayit] Fix extraction (#13851) + [niconico] Support videos with multiple formats (#13522) + [niconico] Support HTML5-only videos (#13806) version 2017.08.06 Core * Use relative paths for DASH fragments (#12990) Extractors * [pluralsight] Fix format selection - [mpora] Remove extractor (#13826) + [voot] Add support for voot.com (#10255, #11644, #11814, #12350, #13218) * [vlive:channel] Limit number of videos per page to 100 (#13830) * [podomatic] Extend URL regular expression (#13827) * [cinchcast] Extend URL regular expression * [yandexdisk] Relax URL regular expression (#13824) * [vidme] Extract DASH and HLS formats - [teamfour] Remove extractor (#13782) * [pornhd] Fix extraction (#13783) * [udemy] Fix subtitles extraction (#13812) * [mlb] Extend URL regular expression (#13740, #13773) + [pbs] Add support for new URL schema (#13801) * [nrktv] Update API host (#13796) version 2017.07.30.1 Core * [downloader/hls] Use redirect URL as manifest base (#13755) * [options] Correctly hide login info from debug outputs (#13696) Extractors + [watchbox] Add support for watchbox.de (#13739) - [clipfish] Remove extractor + [youjizz] Fix extraction (#13744) + [generic] Add support for another ooyala embed pattern (#13727) + [ard] Add support for lives (#13771) * [soundcloud] Update client id + [soundcloud:trackstation] Add support for track stations (#13733) * [svtplay] Use geo 
verification proxy for API request * [svtplay] Update API URL (#13767) + [yandexdisk] Add support for yadi.sk (#13755) + [megaphone] Add support for megaphone.fm * [amcnetworks] Make rating optional (#12453) * [cloudy] Fix extraction (#13737) + [nickru] Add support for nickelodeon.ru * [mtv] Improve thumbnail extraction * [nick] Automate geo-restriction bypass (#13711) * [niconico] Improve error reporting (#13696) version 2017.07.23 Core * [YoutubeDL] Improve default format specification (#13704) * [YoutubeDL] Do not override id, extractor and extractor_key for url_transparent entities * [extractor/common] Fix playlist_from_matches Extractors * [itv] Fix production id extraction (#13671, #13703) * [vidio] Make duration non fatal and fix typo * [mtv] Skip missing video parts (#13690) * [sportbox:embed] Fix extraction + [npo] Add support for npo3.nl URLs (#13695) * [dramafever] Remove video id from title (#13699) + [egghead:lesson] Add support for lessons (#6635) * [funnyordie] Extract more metadata (#13677) * [youku:show] Fix playlist extraction (#13248) + [dispeak] Recognize sevt subdomain (#13276) * [adn] Improve error reporting (#13663) * [crunchyroll] Relax series and season regular expression (#13659) + [spiegel:article] Add support for nexx iframe embeds (#13029) + [nexx:embed] Add support for iframe embeds * [nexx] Improve JS embed extraction + [pearvideo] Add support for pearvideo.com (#13031) version 2017.07.15 Core * [YoutubeDL] Don't expand environment variables in meta fields (#13637) Extractors * [spiegeltv] Delegate extraction to nexx extractor (#13159) + [nexx] Add support for nexx.cloud (#10807, #13465) * [generic] Fix rutube embeds extraction (#13641) * [karrierevideos] Fix title extraction (#13641) * [youtube] Don't capture YouTube Red ad for creator meta field (#13621) * [slideshare] Fix extraction (#13617) + [5tv] Add another video URL pattern (#13354, #13606) * [drtv] Make HLS and HDS extraction non fatal * [ted] Fix subtitles extraction 
(#13628, #13629) * [vine] Make sure the title won't be empty + [twitter] Support HLS streams in vmap URLs + [periscope] Support pscp.tv URLs in embedded frames * [twitter] Extract mp4 urls via mobile API (#12726) * [niconico] Fix authentication error handling (#12486) * [giantbomb] Extract m3u8 formats (#13626) + [vlive:playlist] Add support for playlists (#13613) version 2017.07.09 Core + [extractor/common] Add support for AMP tags in _parse_html5_media_entries + [utils] Support attributes with no values in get_elements_by_attribute Extractors + [dailymail] Add support for embeds + [joj] Add support for joj.sk (#13268) * [abc.net.au:iview] Extract more formats (#13492, #13489) * [egghead:course] Fix extraction (#6635, #13370) + [cjsw] Add support for cjsw.com (#13525) + [eagleplatform] Add support for referrer protected videos (#13557) + [eagleplatform] Add support for another embed pattern (#13557) * [veoh] Extend URL regular expression (#13601) * [npo:live] Fix live stream id extraction (#13568, #13605) * [googledrive] Fix height extraction (#13603) + [dailymotion] Add support for new layout (#13580) - [yam] Remove extractor * [xhamster] Extract all formats and fix duration extraction (#13593) + [xhamster] Add support for new URL schema (#13593) * [espn] Extend URL regular expression (#13244, #13549) * [kaltura] Fix typo in subtitles extraction (#13569) * [vier] Adapt extraction to redesign (#13575) version 2017.07.02 Core * [extractor/common] Improve _json_ld Extractors + [thisoldhouse] Add more fallbacks for video id * [thisoldhouse] Fix video id extraction (#13540, #13541) * [xfileshare] Extend format regular expression (#13536) * [ted] Fix extraction (#13535) + [tastytrade] Add support for tastytrade.com (#13521) * [dplayit] Relax video id regular expression (#13524) + [generic] Extract more generic metadata (#13527) + [bbccouk] Capture and output error message (#13501, #13518) * [cbsnews] Relax video info regular expression (#13284, #13503) + [facebook] Add 
support for plugin video embeds and multiple embeds (#13493) * [soundcloud] Switch to https for API requests (#13502) * [pandatv] Switch to https for API and download URLs + [pandatv] Add support for https URLs (#13491) + [niconico] Support sp subdomain (#13494) version 2017.06.25 Core + [adobepass] Add support for DIRECTV NOW (mso ATTOTT) (#13472) * [YoutubeDL] Skip malformed formats for better extraction robustness Extractors + [wsj] Add support for barrons.com (#13470) + [ign] Add another video id pattern (#13328) + [raiplay:live] Add support for live streams (#13414) + [redbulltv] Add support for live videos and segments (#13486) + [onetpl] Add support for videos embedded via pulsembed (#13482) * [ooyala] Make more robust * [ooyala] Skip empty format URLs (#13471, #13476) * [hgtv.com:show] Fix typo version 2017.06.23 Core * [adobepass] Fix extraction on older python 2.6 Extractors * [youtube] Adapt to new automatic captions rendition (#13467) * [hgtv.com:show] Relax video config regular expression (#13279, #13461) * [drtuber] Fix formats extraction (#12058) * [youporn] Fix upload date extraction * [youporn] Improve formats extraction * [youporn] Fix title extraction (#13456) * [googledrive] Fix formats sorting (#13443) * [watchindianporn] Fix extraction (#13411, #13415) + [vimeo] Add fallback mp4 extension for original format + [ruv] Add support for ruv.is (#13396) * [viu] Fix extraction on older python 2.6 * [pandora.tv] Fix upload_date extraction (#12846) + [asiancrush] Add support for asiancrush.com (#13420) version 2017.06.18 Core * [downloader/common] Use utils.shell_quote for debug command line * [utils] Use compat_shlex_quote in shell_quote * [postprocessor/execafterdownload] Encode command line (#13407) * [compat] Fix compat_shlex_quote on Windows (#5889, #10254) * [postprocessor/metadatafromtitle] Fix missing optional meta fields processing in --metadata-from-title (#13408) * [extractor/common] Fix json dumping with --geo-bypass + [extractor/common] 
Improve jwplayer subtitles extraction + [extractor/common] Improve jwplayer formats extraction (#13379) Extractors * [polskieradio] Fix extraction (#13392) + [xfileshare] Add support for fastvideo.me (#13385) * [bilibili] Fix extraction of videos with double quotes in titles (#13387) * [4tube] Fix extraction (#13381, #13382) + [disney] Add support for disneychannel.de (#13383) * [npo] Improve URL regular expression (#13376) + [corus] Add support for showcase.ca + [corus] Add support for history.ca (#13359) version 2017.06.12 Core * [utils] Handle compat_HTMLParseError in extract_attributes (#13349) + [compat] Introduce compat_HTMLParseError * [utils] Improve unified_timestamp * [extractor/generic] Ensure format id is unicode string * [extractor/common] Return unicode string from _match_id + [YoutubeDL] Sanitize more fields (#13313) Extractors + [xfileshare] Add support for rapidvideo.tv (#13348) * [xfileshare] Modernize and pass Referer + [rutv] Add support for testplayer.vgtrk.com (#13347) + [newgrounds] Extract more metadata (#13232) + [newgrounds:playlist] Add support for playlists (#10611) * [newgrounds] Improve formats and uploader extraction (#13346) * [msn] Fix formats extraction * [turbo] Ensure format id is string * [sexu] Ensure height is int * [jove] Ensure comment count is int * [golem] Ensure format id is string * [gfycat] Ensure filesize is int * [foxgay] Ensure height is int * [flickr] Ensure format id is string * [sohu] Fix numeric fields * [safari] Improve authentication detection (#13319) * [liveleak] Ensure height is int (#13313) * [streamango] Make title optional (#13292) * [rtlnl] Improve URL regular expression (#13295) * [tvplayer] Fix extraction (#13291) version 2017.06.05 Core * [YoutubeDL] Don't emit ANSI escape codes on Windows (#13270) Extractors + [bandcamp:weekly] Add support for bandcamp weekly (#12758) * [pornhub:playlist] Fix extraction (#13281) - [godtv] Remove extractor (#13175) * [safari] Fix typo (#13252) * [youtube] Improve 
chapters extraction (#13247) * [1tv] Lower preference for HTTP formats (#13246) * [francetv] Relax URL regular expression * [drbonanza] Fix extraction (#13231) * [packtpub] Fix authentication (#13240) version 2017.05.29 Extractors * [youtube] Fix DASH MPD extraction for videos with non-encrypted format URLs (#13211) * [xhamster] Fix uploader and like/dislike count extraction (#13216)) + [xhamster] Extract categories (#11728) + [abcnews] Add support for embed URLs (#12851) * [gaskrank] Fix extraction (#12493) * [medialaan] Fix videos with missing videoUrl (#12774) * [dvtv] Fix playlist support + [dvtv] Add support for DASH and HLS formats (#3063) + [beam:vod] Add support for beam.pro/mixer.com VODs (#13032)) * [cbsinteractive] Relax URL regular expression (#13213) * [adn] Fix formats extraction + [youku] Extract more metadata (#10433) * [cbsnews] Fix extraction (#13205) version 2017.05.26 Core + [utils] strip_jsonp() can recognize more patterns * [postprocessor/ffmpeg] Fix metadata filename handling on Python 2 (#13182) Extractors + [youtube] DASH MPDs with cipher signatures are recognized now (#11381) + [bbc] Add support for authentication * [tudou] Merge into youku extractor (#12214) * [youku:show] Fix extraction * [youku] Fix extraction (#13191) * [udemy] Fix extraction for outputs' format entries without URL (#13192) * [vimeo] Fix formats' sorting (#13189) * [cbsnews] Fix extraction for 60 Minutes videos (#12861) version 2017.05.23 Core + [downloader/external] Pass -loglevel to ffmpeg downloader (#13183) + [adobepass] Add support for Bright House Networks (#13149) Extractors + [streamcz] Add support for subtitles (#13174) * [youtube] Fix DASH manifest signature decryption (#8944, #13156) * [toggle] Relax URL regular expression (#13172) * [toypics] Fix extraction (#13077) * [njpwworld] Fix extraction (#13162, #13169) + [hitbox] Add support for smashcast.tv (#13154) * [mitele] Update app key regular expression (#13158) version 2017.05.18.1 Core * [jsinterp] Fix 
typo and cleanup regular expressions (#13134) version 2017.05.18 Core + [jsinterp] Add support for quoted names and indexers (#13123, #13124, #13125, #13126, #13128, #13129, #13130, #13131, #13132) + [extractor/common] Add support for schemeless URLs in _extract_wowza_formats (#13088, #13092) + [utils] Recognize more audio codecs (#13081) Extractors + [vier] Extract more metadata (#12539) * [vier] Improve extraction (#12801) + Add support for authentication * Bypass authentication when no credentials provided * Improve extraction robustness * [dailymail] Fix sources extraction (#13057) * [dailymotion] Extend URL regular expression (#13079) version 2017.05.14 Core + [extractor/common] Respect Width and Height attributes in ISM manifests + [postprocessor/metadatafromtitle] Add support regular expression syntax for --metadata-from-title (#13065) Extractors + [mediaset] Add support for video.mediaset.it (#12708, #12964) * [orf:radio] Fix extraction (#11643, #12926) * [aljazeera] Extend URL regular expression (#13053) * [imdb] Relax URL regular expression (#13056) + [francetv] Add support for mobile.france.tv (#13068) + [upskill] Add support for upskillcourses.com (#13043) * [thescene] Fix extraction (#13061) * [condenast] Improve embed support * [liveleak] Fix extraction (#12053) + [douyu] Support Douyu shows (#12228) * [myspace] Improve URL regular expression (#13040) * [adultswim] Use desktop platform in assets URL (#13041) version 2017.05.09 Core * [YoutubeDL] Force --restrict-filenames when no locale is set on all python versions (#13027) Extractors * [francetv] Adapt to site redesign (#13034) + [packtpub] Add support for authentication (#12622) * [drtv] Lower preference for SignLanguage formats (#13013, #13016) + [cspan] Add support for brightcove live embeds (#13028) * [vrv] Extract DASH formats and subtitles * [funimation] Fix authentication (#13021) * [adultswim] Fix extraction (#8640, #10950, #11042, #12121) + Add support for Adobe Pass authentication + Add 
support for live streams + Add support for show pages * [turner] Extract thumbnail, is_live and strip description + [nonktube] Add support for nonktube.com (#8647, #13024) + [nuevo] Pass headers to _extract_nuevo * [nbc] Improve extraction (#12364) version 2017.05.07 Common * [extractor/common] Fix typo in _extract_akamai_formats + [postprocessor/ffmpeg] Embed chapters into media file with --add-metadata + [extractor/common] Introduce chapters meta field Extractors * [youtube] Fix authentication (#12820, #12927, #12973, #12992, #12993, #12995, #13003) * [bilibili] Fix video downloading (#13001) * [rmcdecouverte] Fix extraction (#12937) * [theplatform] Extract chapters * [bandcamp] Fix thumbnail extraction (#12980) * [pornhub] Extend URL regular expression (#12996) + [youtube] Extract chapters + [nrk] Extract chapters + [vice] Add support for ooyala embeds in article pages + [vice] Support vice articles (#12968) * [vice] Fix extraction for non en_us videos (#12967) * [gdcvault] Fix extraction for some videos (#12733) * [pbs] Improve multipart video support (#12981) * [laola1tv] Fix extraction (#12880) + [cda] Support birthday verification (#12789) * [leeco] Fix extraction (#12974) + [pbs] Extract chapters * [amp] Improve thumbnail and subtitles extraction * [foxsports] Fix extraction (#12945) - [coub] Remove comment count extraction (#12941) version 2017.05.01 Core + [extractor/common] Extract view count from JSON-LD * [utils] Improve unified_timestamp + [utils] Add video/mp2t to mimetype2ext * [downloader/external] Properly handle live stream downloading cancellation (#8932) + [utils] Add support for unicode whitespace in clean_html on python 2 (#12906) Extractors * [infoq] Make audio format extraction non fatal (#12938) * [brightcove] Allow whitespace around attribute names in embedded code + [zaq1] Add support for zaq1.pl (#12693) + [xvideos] Extract duration (#12828) * [vevo] Fix extraction (#12879) + [noovo] Add support for noovo.ca (#12792) + [washingtonpost] 
Add support for embeds (#12699) * [yandexmusic:playlist] Fix extraction for python 3 (#12888) * [anvato] Improve extraction (#12913) * Promote to regular shortcut based extractor * Add mcp to access key mapping table * Add support for embeds extraction * Add support for anvato embeds in generic extractor * [xtube] Fix extraction for older FLV videos (#12734) * [tvplayer] Fix extraction (#12908) version 2017.04.28 Core + [adobepass] Use geo verification headers for all requests - [downloader/fragment] Remove assert for resume_len when no fragments downloaded + [extractor/common] Add manifest_url for explicit group rendition formats * [extractor/common] Fix manifest_url for m3u8 formats - [extractor/common] Don't list master m3u8 playlists in format list (#12832) Extractor * [aenetworks] Fix extraction for shows with single season + [go] Add support for Disney, DisneyJunior and DisneyXD show pages * [youtube] Recognize new locale-based player URLs (#12885) + [streamable] Add support for new embedded URL schema (#12844) * [arte:+7] Relax URL regular expression (#12837) version 2017.04.26 Core * Introduce --keep-fragments for keeping fragments of fragmented download on disk after download is finished * [YoutubeDL] Fix output template for missing timestamp (#12796) * [socks] Handle cases where credentials are required but missing * [extractor/common] Improve HLS extraction (#12211) * Extract m3u8 parsing to separate method * Improve rendition groups extraction * Build stream name according stream GROUP-ID * Ignore reference to AUDIO group without URI when stream has no CODECS * Use float for scaled tbr in _parse_m3u8_formats * [utils] Add support for TTML styles in dfxp2srt * [downloader/hls] No need to download keys for fragments that have been already downloaded * [downloader/fragment] Improve fragment downloading * Resume immediately * Don't concatenate fragments and decrypt them on every resume * Optimize disk storage usage, don't store intermediate fragments on 
disk * Store bookkeeping download state file + [extractor/common] Add support for multiple getters in try_get + [extractor/common] Add support for video of WebPage context in _json_ld (#12778) + [extractor/common] Relax JWPlayer regular expression and remove duplicate URLs (#12768) Extractors * [iqiyi] Fix extraction of Yule videos * [vidio] Improve extraction and sort formats + [brightcove] Match only video elements with data-video-id attribute * [iqiyi] Fix playlist detection (#12504) - [azubu] Remove extractor (#12813) * [porn91] Fix extraction (#12814) * [vidzi] Fix extraction (#12793) + [amp] Extract error message (#12795) + [xfileshare] Add support for gorillavid.com and daclips.com (#12776) * [instagram] Fix extraction (#12777) + [generic] Support Brightcove videos in ', webpage, 'embed url')) if VKIE.suitable(embed_url): return self.url_result(embed_url, VKIE.ie_key(), video_id) embed_page = self._download_webpage( embed_url, video_id, headers={'Referer': url}) video_ext = self._get_cookies(embed_url).get('video_ext') if video_ext: video_ext = compat_urllib_parse_unquote(video_ext.value) if not video_ext: video_ext = compat_b64decode(self._search_regex( r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)', embed_page, 'video_ext')).decode() video_id, sig, _, access_token = video_ext.split(':') item = self._download_json( 'https://api.vk.com/method/video.get', video_id, headers={'User-Agent': 'okhttp/3.4.1'}, query={ 'access_token': access_token, 'sig': sig, 'v': 5.44, 'videos': video_id, })['response']['items'][0] title = item['title'] formats = [] for f_id, f_url in item.get('files', {}).items(): if f_id == 'external': return self.url_result(f_url) ext, height = f_id.split('_') formats.append({ 'format_id': height + 'p', 'url': f_url, 'height': int_or_none(height), 'ext': ext, }) self._sort_formats(formats) thumbnails = [] for k, v in item.items(): if k.startswith('photo_') and v: width = k.replace('photo_', '') thumbnails.append({ 'id': width, 'url': v, 'width': 
int_or_none(width), }) return { 'id': video_id, 'title': title, 'formats': formats, 'comment_count': int_or_none(item.get('comments')), 'description': item.get('description'), 'duration': int_or_none(item.get('duration')), 'thumbnails': thumbnails, 'timestamp': int_or_none(item.get('date')), 'uploader': item.get('owner_id'), 'view_count': int_or_none(item.get('views')), } ================================================ FILE: youtube_dl/extractor/bitchute.py ================================================ # coding: utf-8 from __future__ import unicode_literals import itertools import re from .common import InfoExtractor from ..utils import ( orderedSet, unified_strdate, urlencode_postdata, ) class BitChuteIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bitchute.com/video/szoMrox2JEI/', 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb', 'info_dict': { 'id': 'szoMrox2JEI', 'ext': 'mp4', 'title': 'Fuck bitches get money', 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Victoria X Rave', 'upload_date': '20170813', }, }, { 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', 'only_matching': True, }, { 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', }) title = self._html_search_regex( (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'([^<]+)'), webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', default=None) or self._og_search_description(webpage) format_urls = [] for mobj in re.finditer( 
r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): format_urls.append(mobj.group('url')) format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) formats = [ {'url': format_url} for format_url in orderedSet(format_urls)] if not formats: formats = self._parse_html5_media_entries( url, webpage, video_id)[0]['formats'] self._check_formats(formats, video_id) self._sort_formats(formats) description = self._html_search_regex( r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail( webpage, default=None) or self._html_search_meta( 'twitter:image:src', webpage, 'thumbnail') uploader = self._html_search_regex( (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._search_regex( r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, 'upload_date': upload_date, 'formats': formats, } class BitChuteChannelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://www.bitchute.com/channel/victoriaxrave/', 'playlist_mincount': 185, 'info_dict': { 'id': 'victoriaxrave', }, } _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' def _entries(self, channel_id): channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id offset = 0 for page_num in itertools.count(1): data = self._download_json( '%sextend/' % channel_url, channel_id, 'Downloading channel page %d' % page_num, data=urlencode_postdata({ 'csrfmiddlewaretoken': self._TOKEN, 'name': '', 'offset': offset, }), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Referer': channel_url, 
'X-Requested-With': 'XMLHttpRequest', 'Cookie': 'csrftoken=%s' % self._TOKEN, }) if data.get('success') is False: break html = data.get('html') if not html: break video_ids = re.findall( r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', html) if not video_ids: break offset += len(video_ids) for video_id in video_ids: yield self.url_result( 'https://www.bitchute.com/video/%s' % video_id, ie=BitChuteIE.ie_key(), video_id=video_id) def _real_extract(self, url): channel_id = self._match_id(url) return self.playlist_result( self._entries(channel_id), playlist_id=channel_id) ================================================ FILE: youtube_dl/extractor/bleacherreport.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from .amp import AMPIE from ..utils import ( ExtractorError, int_or_none, parse_iso8601, ) class BleacherReportIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' _TESTS = [{ 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', 'info_dict': { 'id': '2496438', 'ext': 'mp4', 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', 'uploader_id': 3992341, 'description': 'CFB, ACC, Florida State', 'timestamp': 1434380212, 'upload_date': '20150615', 'uploader': 'Team Stream Now ', }, 'add_ie': ['Ooyala'], }, { 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', 'md5': '6a5cd403418c7b01719248ca97fb0692', 'info_dict': { 'id': '2586817', 'ext': 'webm', 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', 'timestamp': 1446839961, 'uploader': 'Sean Fay', 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', 'uploader_id': 
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from .amp import AMPIE
from ..utils import (
    ExtractorError,
    int_or_none,
    parse_iso8601,
)


class BleacherReportIE(InfoExtractor):
    """Resolve a bleacherreport.com article to the video it embeds.

    Returns a url_transparent result pointing at the hosting platform
    (Bleacher Report CMS, Ooyala, YouTube, Vine, ...), carrying along the
    article metadata (title, uploader, timestamps, counts).
    """

    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football',
        'md5': 'a3ffc3dc73afdbc2010f02d98f990f20',
        'info_dict': {
            'id': '2496438',
            'ext': 'mp4',
            'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?',
            'uploader_id': 3992341,
            'description': 'CFB, ACC, Florida State',
            'timestamp': 1434380212,
            'upload_date': '20150615',
            'uploader': 'Team Stream Now ',
        },
        'add_ie': ['Ooyala'],
    }, {
        'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
        'md5': '6a5cd403418c7b01719248ca97fb0692',
        'info_dict': {
            'id': '2586817',
            'ext': 'webm',
            'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
            'timestamp': 1446839961,
            'uploader': 'Sean Fay',
            'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
            'uploader_id': 6466954,
            'upload_date': '20151011',
        },
        'add_ie': ['Youtube'],
    }]

    def _real_extract(self, url):
        article_id = self._match_id(url)

        article_data = self._download_json(
            'http://api.bleacherreport.com/api/v1/articles/%s' % article_id,
            article_id)['article']

        video = article_data.get('video')
        if not video:
            # Articles with no attached clip are not downloadable.
            raise ExtractorError('no video in the article', expected=True)

        thumbnails = []
        primary_photo = article_data.get('primaryPhoto')
        if primary_photo:
            thumbnails.append({
                'url': primary_photo['url'],
                'width': primary_photo.get('width'),
                'height': primary_photo.get('height'),
            })

        info = {
            '_type': 'url_transparent',
            'id': article_id,
            'title': article_data['title'],
            'uploader': article_data.get('author', {}).get('name'),
            'uploader_id': article_data.get('authorId'),
            'timestamp': parse_iso8601(article_data.get('createdAt')),
            'thumbnails': thumbnails,
            'comment_count': int_or_none(article_data.get('commentsCount')),
            'view_count': int_or_none(article_data.get('hitCount')),
        }

        # Map the hosting platform reported by the API onto the URL that the
        # corresponding extractor understands; unknown types fall through to a
        # plain concatenation, as before.
        video_type = video['type']
        video_ref = video['id']
        if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'):
            info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video_ref
        elif video_type == 'ooyala.com':
            info['url'] = 'ooyala:%s' % video_ref
        elif video_type == 'youtube.com':
            info['url'] = video_ref
        elif video_type == 'vine.co':
            info['url'] = 'https://vine.co/v/%s' % video_ref
        else:
            info['url'] = video_type + video_ref
        return info


class BleacherReportCMSIE(AMPIE):
    """Extract Bleacher Report CMS-hosted clips via the shared AMP feed logic."""

    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})'
    _TESTS = [{
        'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms',
        'md5': '670b2d73f48549da032861130488c681',
        'info_dict': {
            'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
            'ext': 'mp4',
            'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
            'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
            'upload_date': '20150723',
            'timestamp': 1437679032,
        },
        'expected_warnings': [
            'Unable to download f4m manifest'
        ]
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # Delegate to AMPIE's feed parser, then pin the id back to the one
        # taken from the URL.
        info = self._extract_feed_info(
            'http://vid.bleacherreport.com/videos/%s.akamai' % video_id)
        info['id'] = video_id
        return info
{ web { biteById(_id: $_id) { ...bitePageFrag __typename } __typename } } fragment bitePageFrag on Bite { _id title userKeywords keywords color visibility isPremium owned price extraReview isAudioExists image { filename original { url __typename } __typename } userReactions { _id reactions createdAt __typename } topReactions totalSaveCount saved blerpLibraryType license licenseMetaData playCount totalShareCount totalFavoriteCount totalAddedToBoardCount userCategory userAudioQuality audioCreationState transcription userTranscription description createdAt updatedAt author listingType ownerObject { _id username profileImage { filename original { url __typename } __typename } __typename } transcription favorited visibility isCurated sourceUrl audienceRating strictAudienceRating ownerId reportObject { reportedContentStatus __typename } giphy { mp4 gif __typename } audio { filename original { url __typename } mp3 { url __typename } __typename } __typename } ''') def _real_extract(self, url): audio_id = self._match_id(url) data = { 'operationName': self._GRAPHQL_OPERATIONNAME, 'query': self._GRAPHQL_QUERY, 'variables': { '_id': audio_id } } headers = { 'Content-Type': 'application/json' } json_result = self._download_json('https://api.blerp.com/graphql', audio_id, data=json.dumps(data).encode('utf-8'), headers=headers) bite_json = json_result['data']['web']['biteById'] info_dict = { 'id': bite_json['_id'], 'url': bite_json['audio']['mp3']['url'], 'title': bite_json['title'], 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none), 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none), 'ext': 'mp3', 'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None) } return info_dict ================================================ FILE: youtube_dl/extractor/bloomberg.py ================================================ # coding: utf-8 from 
__future__ import unicode_literals import re from .common import InfoExtractor class BloombergIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', # The md5 checksum changes 'info_dict': { 'id': 'qurhIVlJSB6hzkVi229d8g', 'ext': 'flv', 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', 'description': 'md5:a8ba0302912d03d246979735c17d2761', }, 'params': { 'format': 'best[format_id^=hds]', }, }, { # video ID in BPlayer(...) 'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/', 'info_dict': { 'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74', 'ext': 'flv', 'title': 'Meet the Real-Life Tech Wizards of Middle Earth', 'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.', }, 'params': { 'format': 'best[format_id^=hds]', }, }, { # data-bmmrid= 'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money', 'only_matching': True, }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, }, { 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump', 'only_matching': True, }] def _real_extract(self, url): name = self._match_id(url) webpage = self._download_webpage(url, name) video_id = self._search_regex( (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'), webpage, 'id', group='id', default=None) if not video_id: bplayer_data = self._parse_json(self._search_regex( r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) video_id = bplayer_data['id'] title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( 
'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) formats = [] for stream in embed_info['streams']: stream_url = stream.get('url') if not stream_url: continue if stream['muxing_format'] == 'TS': formats.extend(self._extract_m3u8_formats( stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.extend(self._extract_f4m_formats( stream_url, video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), } ================================================ FILE: youtube_dl/extractor/bokecc.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_parse_qs from ..utils import ExtractorError class BokeCCBaseIE(InfoExtractor): def _extract_bokecc_formats(self, webpage, video_id, format_id=None): player_params_str = self._html_search_regex( r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)', webpage, 'player params', group='query') player_params = compat_parse_qs(player_params_str) info_xml = self._download_xml( 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( player_params['siteid'][0], player_params['vid'][0]), video_id) formats = [{ 'format_id': format_id, 'url': quality.find('./copy').attrib['playurl'], 'preference': int(quality.attrib['value']), } for quality in info_xml.findall('./video/quality')] self._sort_formats(formats) return formats class BokeCCIE(BokeCCBaseIE): IE_DESC = 'CC视频' _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' _TESTS = [{ 'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A', 'info_dict': { 'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461', 'ext': 'flv', 'title': 'BokeCC 
Video', }, }] def _real_extract(self, url): qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) if not qs.get('vid') or not qs.get('uid'): raise ExtractorError('Invalid URL', expected=True) video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) webpage = self._download_webpage(url, video_id) return { 'id': video_id, 'title': 'BokeCC Video', # no title provided in the webpage 'formats': self._extract_bokecc_formats(webpage, video_id), } ================================================ FILE: youtube_dl/extractor/bongacams.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, try_get, urlencode_postdata, ) class BongaCamsIE(InfoExtractor): _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.(?:com|net))/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://de.bongacams.com/azumi-8', 'only_matching': True, }, { 'url': 'https://cn.bongacams.com/azumi-8', 'only_matching': True, }, { 'url': 'https://de.bongacams.net/claireashton', 'info_dict': { 'id': 'claireashton', 'ext': 'mp4', 'title': r're:ClaireAshton \d{4}-\d{2}-\d{2} \d{2}:\d{2}', 'age_limit': 18, 'uploader_id': 'ClaireAshton', 'uploader': 'ClaireAshton', 'like_count': int, 'is_live': True, }, 'params': { 'skip_download': True, }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') channel_id = mobj.group('id') amf = self._download_json( 'https://%s/tools/amf.php' % host, channel_id, data=urlencode_postdata(( ('method', 'getRoomData'), ('args[]', channel_id), ('args[]', 'false'), )), headers={'X-Requested-With': 'XMLHttpRequest'}) server_url = amf['localData']['videoServerUrl'] uploader_id = try_get( amf, lambda x: x['performerData']['username'], compat_str) or channel_id uploader = try_get( amf, lambda x: x['performerData']['displayName'], compat_str) like_count = int_or_none(try_get( amf, lambda 
x: x['performerData']['loversCount'])) formats = self._extract_m3u8_formats( '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), channel_id, 'mp4', m3u8_id='hls', live=True) self._sort_formats(formats) return { 'id': channel_id, 'title': self._live_title(uploader or uploader_id), 'uploader': uploader, 'uploader_id': uploader_id, 'like_count': like_count, 'age_limit': 18, 'is_live': True, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/bostonglobe.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( extract_attributes, ) class BostonGlobeIE(InfoExtractor): _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' _TESTS = [ { 'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', 'md5': '0a62181079c85c2d2b618c9a738aedaf', 'info_dict': { 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', 'id': '5320421710001', 'ext': 'mp4', 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', 'timestamp': 1486877593, 'upload_date': '20170212', 'uploader_id': '245991542', }, }, { # Embedded youtube video; we hand it off to the Generic extractor. 
'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', 'md5': '582b40327089d5c0c949b3c54b13c24b', 'info_dict': { 'title': "Who Is Matt Damon's Favorite Batman?", 'id': 'ZW1QCnlA6Qc', 'ext': 'mp4', 'upload_date': '20170217', 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', 'uploader': 'The Late Late Show with James Corden', 'uploader_id': 'TheLateLateShow', }, 'expected_warnings': ['404'], }, ] def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) page_title = self._og_search_title(webpage, default=None) # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> entries = [] for video in re.findall(r'(?i)(<video[^>]+>)', webpage): attrs = extract_attributes(video) video_id = attrs.get('data-brightcove-video-id') account_id = attrs.get('data-account') player_id = attrs.get('data-player') embed = attrs.get('data-embed') if video_id and account_id and player_id and embed: entries.append( 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id)) if len(entries) == 0: return self.url_result(url, 'Generic') elif len(entries) == 1: return self.url_result(entries[0], 'BrightcoveNew') else: return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew') ================================================ FILE: youtube_dl/extractor/box.py ================================================ # coding: utf-8 from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..utils import ( determine_ext, parse_iso8601, # try_get, update_url_query, ) class BoxIE(InfoExtractor): _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' _TEST = { 'url': 
    def _real_extract(self, url):
        """Extract a file shared on Box.

        Three-step flow: scrape the page-level request token, exchange it
        for a shared-link read token, then query the Box Files API for
        metadata (including the direct download URL).
        """
        shared_name, file_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, file_id)
        # The page embeds a Box.config object holding the CSRF-like
        # request token needed for the token-exchange endpoint below.
        request_token = self._parse_json(self._search_regex(
            r'Box\.config\s*=\s*({.+?});', webpage,
            'Box config'), file_id)['requestToken']
        # Exchange the request token for a per-file read access token.
        access_token = self._download_json(
            'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
            'Downloading token JSON metadata',
            data=json.dumps({'fileIDs': [file_id]}).encode(), headers={
                'Content-Type': 'application/json',
                'X-Request-Token': request_token,
                'X-Box-EndUser-API': 'sharedName=' + shared_name,
            })[file_id]['read']
        shared_link = 'https://app.box.com/s/' + shared_name
        # Fetch file metadata; the BoxApi header scopes the access token
        # to the shared link.
        f = self._download_json(
            'https://api.box.com/2.0/files/' + file_id, file_id,
            'Downloading file JSON metadata', headers={
                'Authorization': 'Bearer ' + access_token,
                'BoxApi': 'shared_link=' + shared_link,
                'X-Rep-Hints': '[dash]',  # TODO: extract `hls` formats
            }, query={
                'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size'
            })
        title = f['name']
        query = {
            'access_token': access_token,
            'shared_link': shared_link
        }
        formats = []
        # DASH representation extraction is disabled until the query can be
        # appended to every fragment URL:
        # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []):
        #     entry_url_template = try_get(
        #         entry, lambda x: x['content']['url_template'])
        #     if not entry_url_template:
        #         continue
        #     representation = entry.get('representation')
        #     if representation == 'dash':
        #         # TODO: append query to every fragment URL
        #         formats.extend(self._extract_mpd_formats(
        #             entry_url_template.replace('{+asset_path}', 'manifest.mpd'),
        #             file_id, query=query))
        # Only the direct download is offered, and only when Box allows it.
        authenticated_download_url = f.get('authenticated_download_url')
        if authenticated_download_url and f.get('is_download_available'):
            formats.append({
                'ext': f.get('extension') or determine_ext(title),
                'filesize': f.get('size'),
                'format_id': 'download',
                'url': update_url_query(authenticated_download_url, query),
            })
        self._sort_formats(formats)
        creator = f.get('created_by') or {}
        return {
            'id': file_id,
            'title': title,
            'formats': formats,
            'description': f.get('description') or None,
            'uploader': creator.get('name'),
            'timestamp': parse_iso8601(f.get('created_at')),
            'uploader_id': creator.get('id'),
        }
} } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') video_info_dicts = re.findall( r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: video_info = self._parse_json( video_info, video_id, transform_source=js_to_json, fatal=False) if not video_info: continue video_url = video_info.get('src') if not video_url: continue quality = 'high' if '_high' in video_url else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, 'format_note': quality, 'format_id': '%s-%s' % (quality, determine_ext(video_url)), }) self._sort_formats(formats) return { 'id': video_id, 'formats': formats, 'title': title, 'description': self._og_search_description(webpage), } ================================================ FILE: youtube_dl/extractor/br.py ================================================ # coding: utf-8 from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..utils import ( determine_ext, ExtractorError, int_or_none, parse_duration, parse_iso8601, xpath_element, xpath_text, ) class BRIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk' _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' _TESTS = [ { 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html', 'md5': '83a0477cf0b8451027eb566d88b51106', 'info_dict': { 'id': '48f656ef-287e-486f-be86-459122db22cc', 'ext': 'mp4', 'title': 'Die böse Überraschung', 'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9', 'duration': 180, 'uploader': 'Reinhard Weber', 'upload_date': '20150422', }, 'skip': '404 not found', }, { 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', 'md5': 
'af3a3a4aa43ff0ce6a89504c67f427ef', 'info_dict': { 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', 'ext': 'flv', 'title': 'Manfred Schreiber ist tot', 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', 'duration': 26, }, 'skip': '404 not found', }, { 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', 'info_dict': { 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', 'ext': 'aac', 'title': 'Kurzweilig und sehr bewegend', 'description': 'md5:0351996e3283d64adeb38ede91fac54e', 'duration': 296, }, 'skip': '404 not found', }, { 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', 'md5': 'dbab0aef2e047060ea7a21fc1ce1078a', 'info_dict': { 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', 'ext': 'mp4', 'title': 'Umweltbewusster Häuslebauer', 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2', 'duration': 116, } }, { 'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html', 'md5': '23bca295f1650d698f94fc570977dae3', 'info_dict': { 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', 'ext': 'mp4', 'title': 'Folge 1 - Metaphysik', 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', 'duration': 893, 'uploader': 'Eva Maria Steimle', 'upload_date': '20170208', } }, ] def _real_extract(self, url): base_url, display_id = re.search(self._VALID_URL, url).groups() page = self._download_webpage(url, display_id) xml_url = self._search_regex( r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') xml = self._download_xml(base_url + xml_url, display_id) medias = [] for xml_media in xml.findall('video') + xml.findall('audio'): media_id = xml_media.get('externalId') media = { 'id': media_id, 'title': xpath_text(xml_media, 'title', 'title', True), 'duration': 
parse_duration(xpath_text(xml_media, 'duration')), 'formats': self._extract_formats(xpath_element( xml_media, 'assets'), media_id), 'thumbnails': self._extract_thumbnails(xpath_element( xml_media, 'teaserImage/variants'), base_url), 'description': xpath_text(xml_media, 'desc'), 'webpage_url': xpath_text(xml_media, 'permalink'), 'uploader': xpath_text(xml_media, 'author'), } broadcast_date = xpath_text(xml_media, 'broadcastDate') if broadcast_date: media['upload_date'] = ''.join(reversed(broadcast_date.split('.'))) medias.append(media) if len(medias) > 1: self._downloader.report_warning( 'found multiple medias; please ' 'report this with the video URL to http://yt-dl.org/bug') if not medias: raise ExtractorError('No media entries found') return medias[0] def _extract_formats(self, assets, media_id): formats = [] for asset in assets.findall('asset'): format_url = xpath_text(asset, ['downloadUrl', 'url']) asset_type = asset.get('type') if asset_type.startswith('HDS'): formats.extend(self._extract_f4m_formats( format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False)) elif asset_type.startswith('HLS'): formats.extend(self._extract_m3u8_formats( format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False)) else: format_info = { 'ext': xpath_text(asset, 'mediaType'), 'width': int_or_none(xpath_text(asset, 'frameWidth')), 'height': int_or_none(xpath_text(asset, 'frameHeight')), 'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')), 'abr': int_or_none(xpath_text(asset, 'bitrateAudio')), 'vcodec': xpath_text(asset, 'codecVideo'), 'acodec': xpath_text(asset, 'codecAudio'), 'container': xpath_text(asset, 'mediaType'), 'filesize': int_or_none(xpath_text(asset, 'size')), } format_url = self._proto_relative_url(format_url) if format_url: http_format_info = format_info.copy() http_format_info.update({ 'url': format_url, 'format_id': 'http-%s' % asset_type, }) formats.append(http_format_info) server_prefix = xpath_text(asset, 'serverPrefix') if server_prefix: 
rtmp_format_info = format_info.copy() rtmp_format_info.update({ 'url': server_prefix, 'play_path': xpath_text(asset, 'fileName'), 'format_id': 'rtmp-%s' % asset_type, }) formats.append(rtmp_format_info) self._sort_formats(formats) return formats def _extract_thumbnails(self, variants, base_url): thumbnails = [{ 'url': base_url + xpath_text(variant, 'url'), 'width': int_or_none(xpath_text(variant, 'width')), 'height': int_or_none(xpath_text(variant, 'height')), } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails class BRMediathekIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' _TESTS = [{ 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', 'md5': 'fdc3d485835966d1622587d08ba632ec', 'info_dict': { 'id': 'av:5a1e6a6e8fce6d001871cc8e', 'ext': 'mp4', 'title': 'Die Sendung vom 28.11.2017', 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', 'timestamp': 1511942766, 'upload_date': '20171129', } }] def _real_extract(self, url): clip_id = self._match_id(url) clip = self._download_json( 'https://proxy-base.master.mango.express/graphql', clip_id, data=json.dumps({ "query": """{ viewer { clip(id: "%s") { title description duration createdAt ageRestriction videoFiles { edges { node { publicLocation fileSize videoProfile { width height bitrate encoding } } } } captionFiles { edges { node { publicLocation } } } teaserImages { edges { node { imageFiles { edges { node { publicLocation width height } } } } } } } } }""" % clip_id}).encode(), headers={ 'Content-Type': 'application/json', })['data']['viewer']['clip'] title = clip['title'] formats = [] for edge in clip.get('videoFiles', {}).get('edges', []): node = edge.get('node', {}) n_url = node.get('publicLocation') if not n_url: continue ext = 
determine_ext(n_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( n_url, clip_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) else: video_profile = node.get('videoProfile', {}) tbr = int_or_none(video_profile.get('bitrate')) format_id = 'http' if tbr: format_id += '-%d' % tbr formats.append({ 'format_id': format_id, 'url': n_url, 'width': int_or_none(video_profile.get('width')), 'height': int_or_none(video_profile.get('height')), 'tbr': tbr, 'filesize': int_or_none(node.get('fileSize')), }) self._sort_formats(formats) subtitles = {} for edge in clip.get('captionFiles', {}).get('edges', []): node = edge.get('node', {}) n_url = node.get('publicLocation') if not n_url: continue subtitles.setdefault('de', []).append({ 'url': n_url, }) thumbnails = [] for edge in clip.get('teaserImages', {}).get('edges', []): for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): node = image_edge.get('node', {}) n_url = node.get('publicLocation') if not n_url: continue thumbnails.append({ 'url': n_url, 'width': int_or_none(node.get('width')), 'height': int_or_none(node.get('height')), }) return { 'id': clip_id, 'title': title, 'description': clip.get('description'), 'duration': int_or_none(clip.get('duration')), 'timestamp': parse_iso8601(clip.get('createdAt')), 'age_limit': int_or_none(clip.get('ageRestriction')), 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, } ================================================ FILE: youtube_dl/extractor/bravotv.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .adobepass import AdobePassIE from ..utils import ( smuggle_url, update_url_query, int_or_none, ) class BravoTVIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 
    def _real_extract(self, url):
        """Extract a BravoTV/Oxygen video by delegating to ThePlatform.

        Two mutually exclusive page layouts are handled:
        - 'ls_tve' (TV-Everywhere): may require AdobePass authentication;
        - 'ls_playlist': a shared playlist whose default clip is used.
        Either way the result is a url_transparent entry pointing at a
        link.theplatform.com SMIL URL.
        """
        site, display_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, display_id)
        # Player configuration lives in the Drupal settings JSON blob.
        settings = self._parse_json(self._search_regex(
            r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), display_id)
        info = {}
        query = {
            'mbr': 'true',
        }
        account_pid, release_pid = [None] * 2
        tve = settings.get('ls_tve')
        if tve:
            query['manifest'] = 'm3u'
            # Prefer the account/release ids embedded in the pdk-player tag.
            mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage)
            if mobj:
                account_pid, tp_path = mobj.groups()
                release_pid = tp_path.strip('/').split('/')[-1]
            else:
                # Fall back to the known Bravo account pid.
                account_pid = 'HNK2IC'
                tp_path = release_pid = tve['release_pid']
            if tve.get('entitlement') == 'auth':
                # Entitled content: obtain an AdobePass auth token.
                adobe_pass = settings.get('tve_adobe_auth', {})
                if site == 'bravotv':
                    site = 'bravo'
                resource = self._get_mvpd_resource(
                    adobe_pass.get('adobePassResourceId') or site,
                    tve['title'], release_pid, tve.get('rating'))
                query['auth'] = self._extract_mvpd_auth(
                    url, release_pid,
                    adobe_pass.get('adobePassRequestorId') or site, resource)
        else:
            shared_playlist = settings['ls_playlist']
            account_pid = shared_playlist['account_pid']
            metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']]
            tp_path = release_pid = metadata.get('release_pid')
            if not release_pid:
                # No release pid: address the clip by guid instead.
                release_pid = metadata['guid']
                tp_path = 'media/guid/2140479951/' + release_pid
            info.update({
                'title': metadata['title'],
                'description': metadata.get('description'),
                'season_number': int_or_none(metadata.get('season_num')),
                'episode_number': int_or_none(metadata.get('episode_num')),
            })
            query['switch'] = 'progressive'
        info.update({
            '_type': 'url_transparent',
            'id': release_pid,
            'url': smuggle_url(update_url_query(
                'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path),
                query), {'force_smil_url': True}),
            'ie_key': 'ThePlatform',
        })
        return info
info.update({ 'title': metadata['title'], 'description': metadata.get('description'), 'season_number': int_or_none(metadata.get('season_num')), 'episode_number': int_or_none(metadata.get('episode_num')), }) query['switch'] = 'progressive' info.update({ '_type': 'url_transparent', 'id': release_pid, 'url': smuggle_url(update_url_query( 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path), query), {'force_smil_url': True}), 'ie_key': 'ThePlatform', }) return info ================================================ FILE: youtube_dl/extractor/breakcom.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( int_or_none, url_or_none, ) class BreakIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { 'id': '2468056', 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', 'age_limit': 13, }, }, { # youtube embed 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', 'info_dict': { 'id': 'RrrDLdeL2HQ', 'ext': 'mp4', 'title': 'Whale Watching Boat Crashing Into San Diego Dock', 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', 'upload_date': '20160331', 'uploader': 'Steve Holden', 'uploader_id': 'sdholden07', }, 'params': { 'skip_download': True, } }, { 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', 'only_matching': True, }] def _real_extract(self, url): display_id, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) youtube_url = YoutubeIE._extract_url(webpage) if youtube_url: return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) content = self._parse_json( self._search_regex( r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, 'content'), display_id) formats = [] for video in content: 
video_url = url_or_none(video.get('url')) if not video_url: continue bitrate = int_or_none(self._search_regex( r'(\d+)_kbps', video_url, 'tbr', default=None)) formats.append({ 'url': video_url, 'format_id': 'http-%d' % bitrate if bitrate else 'http', 'tbr': bitrate, }) self._sort_formats(formats) title = self._search_regex( (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') def get(key, name): return int_or_none(self._search_regex( r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, default=None)) age_limit = get('ratings', 'age limit') video_id = video_id or get('pid', 'video id') or display_id return { 'id': video_id, 'display_id': display_id, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'age_limit': age_limit, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/brightcove.py ================================================ # coding: utf-8 from __future__ import unicode_literals import base64 import re import struct from .adobepass import AdobePassIE from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_HTTPError, compat_parse_qs, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, ) from ..utils import ( clean_html, extract_attributes, ExtractorError, find_xpath_attr, fix_xml_ampersands, float_or_none, int_or_none, js_to_json, mimetype2ext, parse_iso8601, smuggle_url, str_or_none, try_get, unescapeHTML, unsmuggle_url, UnsupportedError, update_url_query, url_or_none, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' _TESTS = [ { # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', 'md5': 
'5423e113865d26e40624dce2e4b45d95', 'note': 'Test Brightcove downloads and detection in GenericIE', 'info_dict': { 'id': '2371591881001', 'ext': 'mp4', 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', 'uploader': '8TV', 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', 'timestamp': 1368213670, 'upload_date': '20130510', 'uploader_id': '1589608506001', }, 'skip': 'The player has been deactivated by the content owner', }, { # From http://medianetwork.oracle.com/video/player/1785452137001 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', 'info_dict': { 'id': '1785452137001', 'ext': 'flv', 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', 'uploader': 'Oracle', 'timestamp': 1344975024, 'upload_date': '20120814', 'uploader_id': '1460825906', }, 'skip': 'video not playable', }, { # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', 'info_dict': { 'id': '2750934548001', 'ext': 'mp4', 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', # 'uploader': 'Mashable', 'timestamp': 1382041798, 'upload_date': '20131017', 'uploader_id': '1130468786001', }, }, { # test that the default referer works # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/ 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', 'info_dict': { 'id': '2878862109001', 'ext': 'mp4', 'title': 'Lost in Motion II', 'description': 'md5:363109c02998fee92ec02211bd8000df', 'uploader': 'National 
Ballet of Canada', }, 'skip': 'Video gone', }, { # test flv videos served by akamaihd.net # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', # The md5 checksum changes on each download 'info_dict': { 'id': '3750436379001', 'ext': 'flv', 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', 'uploader': 'RBTV Old (do not use)', 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', 'timestamp': 1409122195, 'upload_date': '20140827', 'uploader_id': '710858724001', }, 'skip': 'Video gone', }, { # playlist with 'videoList' # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', 'info_dict': { 'title': 'Sealife', 'id': '3550319591001', }, 'playlist_mincount': 7, 'skip': 'Unsupported URL', }, { # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965) 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', 'info_dict': { 'id': '1522758701001', 'title': 'Lesson 08', }, 'playlist_mincount': 10, 'skip': 'Unsupported URL', }, { # playerID inferred from bcpid # from http://www.un.org/chinese/News/story.asp?NewsID=27724 'url': 
    @classmethod
    def _build_brightcove_url(cls, object_str):
        """
        Build a Brightcove url from a xml string containing
        <object class="BrightcoveExperience">{params}</object>

        Returns None when the embedded video id does not look like a valid
        Brightcove id; raises ExtractorError when no player ID is present.
        """

        # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553
        # (self-close unclosed <param ...> tags so the XML parser accepts them)
        object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>',
                            lambda m: m.group(1) + '/>', object_str)
        # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608
        object_str = object_str.replace('<--', '<!--')
        # remove namespace to simplify extraction
        object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
        object_str = fix_xml_ampersands(object_str)

        try:
            object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
        except compat_xml_parse_error:
            # Unsalvageable markup: give up silently (caller filters None)
            return

        # Player parameters may live in a flashVars query string...
        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
        if fv_el is not None:
            flashvars = dict(
                (k, v[0])
                for k, v in compat_parse_qs(fv_el.attrib['value']).items())
        else:
            flashvars = {}

        # ...or in the query string of the object's data attribute.
        data_url = object_doc.attrib.get('data', '')
        data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query)

        def find_param(name):
            # Lookup order: flashVars, then <param> elements, then data URL query.
            if name in flashvars:
                return flashvars[name]
            node = find_xpath_attr(object_doc, './param', 'name', name)
            if node is not None:
                return node.attrib['value']
            return data_url_params.get(name)

        params = {}

        playerID = find_param('playerID') or find_param('playerId')
        if playerID is None:
            raise ExtractorError('Cannot find player ID')
        params['playerID'] = playerID

        playerKey = find_param('playerKey')
        # Not all pages define this value
        if playerKey is not None:
            params['playerKey'] = playerKey
        # These fields hold the id of the video
        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
        if videoPlayer is not None:
            if isinstance(videoPlayer, list):
                # compat_parse_qs values are lists; take the first entry
                videoPlayer = videoPlayer[0]
            videoPlayer = videoPlayer.strip()
            # UUID is also possible for videoPlayer (e.g.
            # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd
            # or http://www8.hp.com/cn/zh/home.html)
            if not (re.match(
                    r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$',
                    videoPlayer) or videoPlayer.startswith('ref:')):
                return None
            params['@videoPlayer'] = videoPlayer
        linkBase = find_param('linkBaseURL')
        if linkBase is not None:
            params['linkBaseURL'] = linkBase
        return cls._make_brightcove_url(params)

    @classmethod
    def _build_brightcove_url_from_js(cls, object_js):
        """Build a Brightcove URL from a customBC.createVideo(...) JS call."""
        # The layout of JS is as follows:
        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
        #   // build Brightcove <object /> XML
        # }
        m = re.search(
            r'''(?x)customBC\.createVideo\(
                .*?                                                  # skipping width and height
                ["\'](?P<playerID>\d+)["\']\s*,\s*                   # playerID
                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
                                                                     # in length, however it's appended to itself
                                                                     # in places, so truncate
                ["\'](?P<videoID>\d+)["\']                           # @videoPlayer
            ''', object_js)
        if m:
            return cls._make_brightcove_url(m.groupdict())
        # implicit None when the pattern does not match

    @classmethod
    def _make_brightcove_url(cls, params):
        # Serialize the collected player parameters into the canonical
        # legacy viewer URL.
        return update_url_query(
            'http://c.brightcove.com/services/viewer/htmlFederated', params)

    @classmethod
    def _extract_brightcove_url(cls, webpage):
        """Try to extract the brightcove url from the webpage, returns None
        if it can't be found
        """
        urls = cls._extract_brightcove_urls(webpage)
        return urls[0] if urls else None
    @classmethod
    def _extract_brightcove_urls(cls, webpage):
        """Return a list of all Brightcove URLs from the webpage.

        Four detection strategies are tried in order; the first one that
        yields results wins:
          1. <meta property/itemprop og:video|embedURL> tags
          2. <object class="BrightcoveExperience"> / <param name="movie"> embeds
          3. customBC.createVideo(...) JS calls
          4. link.brightcove.com iframes
        """

        url_m = re.search(
            r'''(?x)
                <meta\s+
                    (?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+
                    content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2
            ''', webpage)
        if url_m:
            url = unescapeHTML(url_m.group('url'))
            # Some sites don't add it, we can't download with this url, for example:
            # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
            # so only accept meta URLs that carry a key/id parameter.
            if 'playerKey' in url or 'videoId' in url or 'idVideo' in url:
                return [url]

        matches = re.findall(
            r'''(?sx)<object
            (?:
                [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
            ).+?>\s*</object>''',
            webpage)
        if matches:
            # _build_brightcove_url may return None for bogus embeds; drop those.
            return list(filter(None, [cls._build_brightcove_url(m) for m in matches]))

        matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
        if matches:
            return list(filter(None, [
                cls._build_brightcove_url_from_js(custom_bc)
                for custom_bc in matches]))
        # Last resort: bare link.brightcove.com player iframes.
        return [src for _, src in re.findall(
            r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
    def _real_extract(self, url):
        """Translate a legacy Brightcove viewer URL into a modern
        players.brightcove.net URL and defer to BrightcoveNewIE.

        To do that the publisher id must be recovered: from the query string,
        from the playerKey (which encodes it base64/struct-packed), or by
        fetching the player page and scraping its playerKey.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        # Change the 'videoId' and others field to '@videoPlayer'
        url = re.sub(
            r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url)
        # Change bckey (used by bcove.me urls) to playerKey
        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
        mobj = re.match(self._VALID_URL, url)
        query_str = mobj.group('query')
        query = compat_urlparse.parse_qs(query_str)

        videoPlayer = query.get('@videoPlayer')
        if videoPlayer:
            # We set the original url as the default 'Referer' header
            referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url)
            video_id = videoPlayer[0]
            if 'playerID' not in query:
                mobj = re.search(r'/bcpid(\d+)', url)
                if mobj is not None:
                    query['playerID'] = [mobj.group(1)]
            publisher_id = query.get('publisherId')
            if publisher_id and publisher_id[0].isdigit():
                publisher_id = publisher_id[0]
            if not publisher_id:
                # Fall back to decoding the publisher id out of playerKey.
                player_key = query.get('playerKey')
                if player_key and ',' in player_key[0]:
                    player_key = player_key[0]
                else:
                    # No usable playerKey in the URL: scrape it from the
                    # player page (best effort, never fatal).
                    player_id = query.get('playerID')
                    if player_id and player_id[0].isdigit():
                        headers = {}
                        if referer:
                            headers['Referer'] = referer
                        player_page = self._download_webpage(
                            'http://link.brightcove.com/services/player/bcpid' + player_id[0],
                            video_id, headers=headers, fatal=False)
                        if player_page:
                            player_key = self._search_regex(
                                r'<param\s+name="playerKey"\s+value="([\w~,-]+)"',
                                player_page, 'player key', fatal=False)
                if player_key:
                    # Second comma-separated field is the publisher id,
                    # base64url-encoded ('~' stands in for '=' padding)
                    # as a big-endian 64-bit integer.
                    enc_pub_id = player_key.split(',')[1].replace('~', '=')
                    publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0]
            if publisher_id:
                brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id)
                if referer:
                    brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer})
                return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id)
        # TODO: figure out if it's possible to extract playlistId from playerKey
        # elif 'playerKey' in query:
        #     player_key = query['playerKey']
        #     return self._get_playlist_info(player_key[0])
        raise UnsupportedError(url)
'uploader_id': '4036320279001', 'formats': 'mincount:39', }, 'params': { # m3u8 download 'skip_download': True, } }, { # playlist stream 'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001', 'info_dict': { 'id': '5718313430001', 'title': 'No Audio Playlist', }, 'playlist_count': 7, 'params': { # m3u8 download 'skip_download': True, } }, { 'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001', 'only_matching': True, }, { # ref: prefixed video id 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', 'only_matching': True, }, { # non numeric ref: prefixed video id 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', 'only_matching': True, }, { # unavailable video without message but with error_code 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001', 'only_matching': True, }] @staticmethod def _extract_url(ie, webpage): urls = BrightcoveNewIE._extract_urls(ie, webpage) return urls[0] if urls else None @staticmethod def _extract_urls(ie, webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html # 5. 
    @staticmethod
    def _extract_urls(ie, webpage):
        """Return a list of players.brightcove.net embed URLs found in webpage.

        `ie` is an extractor instance used only to validate ambiguous
        <video>-tag-only embeds via `_is_valid_url`.
        """
        # Reference:
        # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
        # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
        # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
        # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player

        entries = []

        # Look for iframe embeds [1]
        for _, url in re.findall(
                r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
            # Normalize protocol-relative URLs.
            entries.append(url if url.startswith('http') else 'http:' + url)

        # Look for <video> tags [2] and embed_in_page embeds [3]
        # [2] looks like:
        for video, script_tag, account_id, player_id, embed in re.findall(
                r'''(?isx)
                    (<video(?:-js)?\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
                    (?:.*?
                        (<script[^>]+
                            src=["\'](?:https?:)?//players\.brightcove\.net/
                            (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
                        )
                    )?
                ''', webpage):
            attrs = extract_attributes(video)

            # According to examples from [4] it's unclear whether video id
            # may be optional and what to do when it is
            video_id = attrs.get('data-video-id')
            if not video_id:
                continue

            # Prefer ids from the <script> src; fall back to tag attributes.
            account_id = account_id or attrs.get('data-account')
            if not account_id:
                continue

            player_id = player_id or attrs.get('data-player') or 'default'
            embed = embed or attrs.get('data-embed') or 'default'

            bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
                account_id, player_id, embed, video_id)

            # Some brightcove videos may be embedded with video tag only and
            # without script tag or any mentioning of brightcove at all. Such
            # embeds are considered ambiguous since they are matched based only
            # on data-video-id and data-account attributes and in the wild may
            # not be brightcove embeds at all. Let's check reconstructed
            # brightcove URLs in case of such embeds and only process valid
            # ones. By this we ensure there is indeed a brightcove embed.
            if not script_tag and not ie._is_valid_url(
                    bc_url, video_id, 'possible brightcove video'):
                continue

            entries.append(bc_url)

        return entries
    def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
        """Turn a Playback API video object into an info dict.

        `headers` (read-only here, so the mutable default is safe) is merged
        into each format's http_headers. Raises ExtractorError when no usable
        format exists (API error, or all sources DRM-protected).
        """
        title = json_data['name'].strip()

        num_drm_sources = 0
        formats = []
        sources = json_data.get('sources') or []
        for source in sources:
            container = source.get('container')
            ext = mimetype2ext(source.get('type'))
            src = source.get('src')
            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
            if container == 'WVM' or source.get('key_systems'):
                # DRM-protected source: count it so we can report DRM below.
                num_drm_sources += 1
                continue
            elif ext == 'ism':
                # Smooth Streaming sources are skipped entirely.
                continue
            elif ext == 'm3u8' or container == 'M2TS':
                if not src:
                    continue
                formats.extend(self._extract_m3u8_formats(
                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
            elif ext == 'mpd':
                if not src:
                    continue
                formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
            else:
                # Progressive HTTP or RTMP source.
                streaming_src = source.get('streaming_src')
                stream_name, app_name = source.get('stream_name'), source.get('app_name')
                if not src and not streaming_src and (not stream_name or not app_name):
                    continue
                tbr = float_or_none(source.get('avg_bitrate'), 1000)
                height = int_or_none(source.get('height'))
                width = int_or_none(source.get('width'))
                f = {
                    'tbr': tbr,
                    'filesize': int_or_none(source.get('size')),
                    'container': container,
                    'ext': ext or container.lower(),
                }
                if width == 0 and height == 0:
                    # 0x0 dimensions signal an audio-only rendition.
                    f.update({
                        'vcodec': 'none',
                    })
                else:
                    f.update({
                        'width': width,
                        'height': height,
                        'vcodec': source.get('codec'),
                    })

                def build_format_id(kind):
                    # e.g. 'http-128k-720p'
                    format_id = kind
                    if tbr:
                        format_id += '-%dk' % int(tbr)
                    if height:
                        format_id += '-%dp' % height
                    return format_id

                if src or streaming_src:
                    f.update({
                        'url': src or streaming_src,
                        'format_id': build_format_id('http' if src else 'http-streaming'),
                        # Direct src is preferred over streaming_src.
                        'source_preference': 0 if src else -1,
                    })
                else:
                    f.update({
                        'url': app_name,
                        'play_path': stream_name,
                        'format_id': build_format_id('rtmp'),
                    })
                formats.append(f)
        if not formats:
            # Prefer reporting the API's own error; otherwise report DRM
            # when every source was DRM-protected.
            errors = json_data.get('errors')
            if errors:
                error = errors[0]
                raise ExtractorError(
                    error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
            if sources and num_drm_sources == len(sources):
                raise ExtractorError('This video is DRM protected.', expected=True)

        self._sort_formats(formats)

        for f in formats:
            f.setdefault('http_headers', {}).update(headers)

        subtitles = {}
        for text_track in json_data.get('text_tracks', []):
            if text_track.get('kind') != 'captions':
                continue
            text_track_url = url_or_none(text_track.get('src'))
            if not text_track_url:
                continue
            lang = (str_or_none(text_track.get('srclang'))
                    or str_or_none(text_track.get('label')) or 'en').lower()
            subtitles.setdefault(lang, []).append({
                'url': text_track_url,
            })

        is_live = False
        duration = float_or_none(json_data.get('duration'), 1000)
        # Non-positive duration marks a live stream.
        if duration is not None and duration <= 0:
            is_live = True

        return {
            'id': video_id,
            'title': self._live_title(title) if is_live else title,
            'description': clean_html(json_data.get('description')),
            'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
            'duration': duration,
            'timestamp': parse_iso8601(json_data.get('published_at')),
            'uploader_id': json_data.get('account_id'),
            'formats': formats,
            'subtitles': subtitles,
            'tags': json_data.get('tags', []),
            'is_live': is_live,
        }
    def _real_extract(self, url):
        """Fetch video/playlist metadata from the Brightcove Playback API.

        The required policy key is cached per (account, player); on an
        INVALID_POLICY_KEY response the cache is invalidated and the request
        retried once with a freshly scraped key (hence the range(2) loop).
        """
        url, smuggled_data = unsmuggle_url(url, {})
        self._initialize_geo_bypass({
            'countries': smuggled_data.get('geo_countries'),
            'ip_blocks': smuggled_data.get('geo_ip_blocks'),
        })

        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()

        policy_key_id = '%s_%s' % (account_id, player_id)
        policy_key = self._downloader.cache.load('brightcove', policy_key_id)
        policy_key_extracted = False
        store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)

        def extract_policy_key():
            # Try the player's config.json first, then fall back to scraping
            # the player JS (catalog() call or bare policyKey assignment).
            base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
            config = self._download_json(
                base_url + 'config.json', video_id, fatal=False) or {}
            policy_key = try_get(
                config, lambda x: x['video_cloud']['policy_key'])
            if not policy_key:
                webpage = self._download_webpage(
                    base_url + 'index.min.js', video_id)

                catalog = self._search_regex(
                    r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
                if catalog:
                    catalog = self._parse_json(
                        js_to_json(catalog), video_id, fatal=False)
                    if catalog:
                        policy_key = catalog.get('policyKey')

                if not policy_key:
                    policy_key = self._search_regex(
                        r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
                        webpage, 'policy key', group='pk')

            store_pk(policy_key)
            return policy_key

        api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id)
        headers = {}
        referrer = smuggled_data.get('referrer')
        if referrer:
            headers.update({
                'Referer': referrer,
                'Origin': re.search(r'https?://[^/]+', referrer).group(0),
            })

        # At most two attempts: cached key, then a freshly extracted one.
        for _ in range(2):
            if not policy_key:
                policy_key = extract_policy_key()
                policy_key_extracted = True
            headers['Accept'] = 'application/json;pk=%s' % policy_key
            try:
                json_data = self._download_json(api_url, video_id, headers=headers)
                break
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
                    json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
                    message = json_data.get('message') or json_data['error_code']
                    if json_data.get('error_subcode') == 'CLIENT_GEO':
                        self.raise_geo_restricted(msg=message)
                    elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted:
                        # Stale cached key: drop it and retry once.
                        policy_key = None
                        store_pk(None)
                        continue
                    raise ExtractorError(message, expected=True)
                raise

        errors = json_data.get('errors')
        if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
            # TV-Everywhere protected content: re-request with an Adobe Pass token.
            custom_fields = json_data['custom_fields']
            tve_token = self._extract_mvpd_auth(
                smuggled_data['source_url'], video_id,
                custom_fields['bcadobepassrequestorid'],
                custom_fields['bcadobepassresourceid'])
            json_data = self._download_json(
                api_url, video_id, headers={
                    'Accept': 'application/json;pk=%s' % policy_key
                }, query={
                    'tveToken': tve_token,
                })

        if content_type == 'playlist':
            return self.playlist_result(
                [self._parse_brightcove_metadata(vid, vid.get('id'), headers)
                 for vid in json_data.get('videos', []) if vid.get('id')],
                json_data.get('id'), json_data.get('name'),
                json_data.get('description'))

        return self._parse_brightcove_metadata(
            json_data, video_id, headers=headers)
class BusinessInsiderIE(InfoExtractor):
    _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6',
        'md5': 'ffed3e1e12a6f950aa2f7d83851b497a',
        'info_dict': {
            'id': 'cjGDb0X9',
            'ext': 'mp4',
            'title': "Bananas give you more radiation exposure than living next to a nuclear power plant",
            'description': 'md5:0175a3baf200dd8fa658f94cade841b3',
            'upload_date': '20160611',
            'timestamp': 1465675620,
        },
    }, {
        'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/',
        'md5': '43f438dbc6da0b89f5ac42f68529d84a',
        'info_dict': {
            'id': '5zJwd4FK',
            'ext': 'mp4',
            'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort',
            'description': 'md5:2af8975825d38a4fed24717bbe51db49',
            'upload_date': '20170705',
            'timestamp': 1499270528,
        },
    }, {
        'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a Business Insider article to its embedded JW Platform video."""
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)
        # The 8-character media id appears in several different markup flavours.
        id_patterns = (
            r'data-media-id=["\']([a-zA-Z0-9]{8})',
            r'id=["\']jwplayer_([a-zA-Z0-9]{8})',
            r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})',
            r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})',
        )
        media_id = self._search_regex(id_patterns, page, 'jwplatform id')
        return self.url_result(
            'jwplatform:%s' % media_id, ie=JWPlatformIE.ie_key(),
            video_id=display_id)
class BuzzFeedIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)'
    _TESTS = [{
        'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia',
        'info_dict': {
            'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss',
            'title': 'This Angry Ram Destroys A Punching Bag Like A Boss',
            'description': 'Rambro!',
        },
        'playlist': [{
            'info_dict': {
                'id': 'aVCR29aE_OQ',
                'ext': 'mp4',
                'title': 'Angry Ram destroys a punching bag..',
                'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
                'upload_date': '20141024',
                'uploader_id': 'Buddhanz1',
                'uploader': 'Angry Ram',
            }
        }]
    }, {
        'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia',
        'params': {
            'skip_download': True,  # Got enough YouTube download tests
        },
        'info_dict': {
            'id': 'look-at-this-cute-dog-omg',
            'description': 're:Munchkin the Teddy Bear is back ?!',
            'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
        },
        'playlist': [{
            'info_dict': {
                'id': 'mVmBL8B-In0',
                'ext': 'mp4',
                'title': 're:Munchkin the Teddy Bear gets her exercise',
                'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
                'upload_date': '20141124',
                'uploader_id': 'CindysMunchkin',
                'uploader': 're:^Munchkin the',
            },
        }]
    }, {
        'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
        'info_dict': {
            'id': 'the-most-adorable-crash-landing-ever',
            'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
            'description': 'This gosling knows how to stick a landing.',
        },
        'playlist': [{
            'md5': '763ca415512f91ca62e4621086900a23',
            'info_dict': {
                'id': '971793786185728',
                'ext': 'mp4',
                'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
                'uploader': 'Calgary Outdoor Centre-University of Calgary',
            },
        }],
        'add_ie': ['Facebook'],
    }]

    def _real_extract(self, url):
        """Collect every embedded video (bucket-data embeds + Facebook embeds) as a playlist."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        entries = []
        # Each embed carries its metadata JSON in a rel:bf_bucket_data attribute.
        for bucket_json in re.findall(
                r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'',
                webpage):
            bucket = json.loads(bucket_json)
            video_info = bucket.get('video') or bucket.get('progload_video')
            if video_info:
                entries.append(self.url_result(video_info['url']))

        for fb_url in FacebookIE._extract_urls(webpage):
            entries.append(self.url_result(fb_url))

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'entries': entries,
        }
_TESTS = [{ # ooyalaVOD 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'info_dict': { 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', 'display_id': 'studio-c-season-5-episode-5', 'ext': 'mp4', 'title': 'Season 5 Episode 5', 'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65', 'thumbnail': r're:^https?://.*', 'duration': 1486.486, }, 'params': { 'skip_download': True, }, 'add_ie': ['Ooyala'], }, { # dvr 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', 'info_dict': { 'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451', 'display_id': 'byu-softball-pacific-vs-byu-41219---game-2', 'ext': 'mp4', 'title': 'Pacific vs. BYU (4/12/19)', 'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3', 'duration': 11645, }, 'params': { 'skip_download': True }, }, { 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', 'only_matching': True, }, { 'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id video = self._download_json( 'https://api.byutv.org/api3/catalog/getvideosforcontent', display_id, query={ 'contentid': video_id, 'channel': 'byutv', 'x-byutv-context': 'web$US', }, headers={ 'x-byutv-context': 'web$US', 'x-byutv-platformkey': 'xsaaw9c7y5', }) ep = video.get('ooyalaVOD') if ep: return { '_type': 'url_transparent', 'ie_key': 'Ooyala', 'url': 'ooyala:%s' % ep['providerId'], 'id': video_id, 'display_id': display_id, 'title': ep.get('title'), 'description': ep.get('description'), 'thumbnail': ep.get('imageThumbnail'), } info = {} formats = [] for format_id, ep in video.items(): if not isinstance(ep, dict): continue video_url = url_or_none(ep.get('videoUrl')) if 
not video_url: continue ext = determine_ext(video_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) else: formats.append({ 'url': video_url, 'format_id': format_id, }) merge_dicts(info, { 'title': ep.get('title'), 'description': ep.get('description'), 'thumbnail': ep.get('imageThumbnail'), 'duration': parse_duration(ep.get('length')), }) self._sort_formats(formats) return merge_dicts(info, { 'id': video_id, 'display_id': display_id, 'title': display_id, 'formats': formats, }) ================================================ FILE: youtube_dl/extractor/c56.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import js_to_json class C56IE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' IE_NAME = '56.com' _TESTS = [{ 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', 'md5': 'e59995ac63d0457783ea05f93f12a866', 'info_dict': { 'id': '93440716', 'ext': 'flv', 'title': '网事知多少 第32期:车怒', 'duration': 283.813, }, }, { 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html', 'md5': '', 'info_dict': { 'id': '82247482', 'title': '爱的诅咒之杜鹃花开', }, 'playlist_count': 7, 'add_ie': ['Sohu'], }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) text_id = mobj.group('textid') webpage = self._download_webpage(url, text_id) sohu_video_info_str = self._search_regex( r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None) if sohu_video_info_str: sohu_video_info = self._parse_json( sohu_video_info_str, text_id, transform_source=js_to_json) return self.url_result(sohu_video_info['url'], 'Sohu') page = self._download_json( 
class C56IE(InfoExtractor):
    _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
    IE_NAME = '56.com'
    _TESTS = [{
        'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
        'md5': 'e59995ac63d0457783ea05f93f12a866',
        'info_dict': {
            'id': '93440716',
            'ext': 'flv',
            'title': '网事知多少 第32期:车怒',
            'duration': 283.813,
        },
    }, {
        'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
        'md5': '',
        'info_dict': {
            'id': '82247482',
            'title': '爱的诅咒之杜鹃花开',
        },
        'playlist_count': 7,
        'add_ie': ['Sohu'],
    }]

    def _real_extract(self, url):
        """Extract a 56.com video, delegating Sohu-hosted videos to SohuIE."""
        text_id = re.match(self._VALID_URL, url, flags=re.VERBOSE).group('textid')

        webpage = self._download_webpage(url, text_id)
        # Some pages just wrap a Sohu video; detect via the inline JS object.
        sohu_video_info_str = self._search_regex(
            r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage,
            'Sohu video info', default=None)
        if sohu_video_info_str:
            sohu_video_info = self._parse_json(
                sohu_video_info_str, text_id, transform_source=js_to_json)
            return self.url_result(sohu_video_info['url'], 'Sohu')

        page = self._download_json(
            'http://vxml.56.com/json/%s/' % text_id, text_id,
            'Downloading video info')

        info = page['info']

        formats = []
        for rfile in info['rfiles']:
            formats.append({
                'format_id': rfile['type'],
                'filesize': int(rfile['filesize']),
                'url': rfile['url'],
            })
        self._sort_formats(formats)

        return {
            'id': info['vid'],
            'title': info['Subject'],
            'duration': int(info['duration']) / 1000.0,
            'formats': formats,
            'thumbnail': info.get('bimg') or info.get('img'),
        }
class CallinIE(InfoExtractor):
    """Extractor for callin.com episode pages (a Next.js site).

    Episode metadata is read from the embedded __NEXT_DATA__ JSON payload;
    formats come from the episode's HLS manifest.
    """
    _VALID_URL = r'https?://(?:www\.)?callin\.com/episode/(?:[^/#?-]+-)*(?P<id>[^/#?-]+)'
    _TESTS = [{
        'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
        'md5': '14ede27ee2c957b7e4db93140fc0745c',
        'info_dict': {
            'id': 'PrumRdSQJW',
            'ext': 'mp4',
            'title': 'FCC Commissioner Brendan Carr on Elon’s Starlink',
            'description': 'Or, why the government doesn’t like SpaceX',
            'channel': 'The Pull Request',
            'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa',
        }
    }, {
        'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA',
        'md5': '16f704ddbf82a27e3930533b12062f07',
        'info_dict': {
            'id': 'lzxMidUnjA',
            'ext': 'mp4',
            'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
            'description': 'Let’s talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.',
            'channel': 'The DEBRIEF With Briahna Joy Gray',
            'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm',
        }
    }]

    def _search_nextjs_data(self, webpage, video_id, transform_source=None, fatal=True, **kw):
        # Pull the JSON payload out of the __NEXT_DATA__ script tag that
        # Next.js embeds in every server-rendered page.
        return self._parse_json(
            self._search_regex(
                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
                webpage, 'next.js data', fatal=fatal, **kw),
            video_id, transform_source=transform_source, fatal=fatal)

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        next_data = self._search_nextjs_data(webpage, video_id)
        episode = traverse_obj(next_data, ('props', 'pageProps', 'episode'), expected_type=dict)
        if not episode:
            raise ExtractorError('Failed to find episode data')

        title = episode.get('title') or self._og_search_title(webpage)
        description = episode.get('description') or self._og_search_description(webpage)

        formats = []
        # Guard against a missing manifest URL: passing None into
        # _extract_m3u8_formats would error out before its fatal=False
        # handling could apply.
        m3u8_url = episode.get('m3u8')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4',
                entry_protocol='m3u8_native', fatal=False))
        self._sort_formats(formats)

        channel = try_get(episode, lambda x: x['show']['title'], compat_str)
        channel_url = try_get(episode, lambda x: x['show']['linkObj']['resourceUrl'], compat_str)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'channel': channel,
            'channel_url': channel_url,
        }
class CamdemyIE(InfoExtractor):
    """Extractor for camdemy.com media pages.

    Videos hosted externally (e.g. on YouTube) are delegated to the matching
    extractor; native videos are resolved through the oembed API plus a
    fileList.xml manifest located next to the thumbnail.
    """
    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
    _TESTS = [{
        # single file
        'url': 'http://www.camdemy.com/media/5181/',
        'md5': '5a5562b6a98b37873119102e052e311b',
        'info_dict': {
            'id': '5181',
            'ext': 'mp4',
            'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'creator': 'ss11spring',
            'duration': 1591,
            'upload_date': '20130114',
            'view_count': int,
        }
    }, {
        # With non-empty description
        # webpage returns "No permission or not login"
        'url': 'http://www.camdemy.com/media/13885',
        'md5': '4576a3bb2581f86c61044822adbd1249',
        'info_dict': {
            'id': '13885',
            'ext': 'mp4',
            'title': 'EverCam + Camdemy QuickStart',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
            'creator': 'evercam',
            'duration': 318,
        }
    }, {
        # External source (YouTube)
        'url': 'http://www.camdemy.com/media/14842',
        'info_dict': {
            'id': '2vsYQzNIsJo',
            'ext': 'mp4',
            'title': 'Excel 2013 Tutorial - How to add Password Protection',
            'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
            'upload_date': '20130211',
            'uploader': 'Hun Kim',
            'uploader_id': 'hunkimtutorials',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # An explicit "Sources from:" link marks an externally hosted video.
        src_from = self._html_search_regex(
            r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
            webpage, 'external source', default=None, group='url')
        if src_from:
            return self.url_result(src_from)

        oembed_obj = self._download_json(
            'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)

        title = oembed_obj['title']
        thumb_url = oembed_obj['thumbnail_url']
        # The media file lives in a video/ folder next to the thumbnail;
        # its name comes from the fileList.xml manifest in that folder.
        video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
        file_list_doc = self._download_xml(
            compat_urlparse.urljoin(video_folder, 'fileList.xml'),
            video_id, 'Downloading filelist XML')
        file_name = file_list_doc.find('./video/item/fileName').text
        video_url = compat_urlparse.urljoin(video_folder, file_name)

        # Some URLs return "No permission or not login" in a webpage despite being
        # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
        # — hence all webpage-derived fields below are optional (default=None).
        upload_date = unified_strdate(self._search_regex(
            r'>published on ([^<]+)<', webpage,
            'upload date', default=None))
        view_count = str_to_int(self._search_regex(
            r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
            webpage, 'view count', default=None))

        description = self._html_search_meta(
            'description', webpage, default=None) or clean_html(
            oembed_obj.get('description'))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumb_url,
            'description': description,
            'creator': oembed_obj.get('author_name'),
            'duration': parse_duration(oembed_obj.get('duration')),
            'upload_date': upload_date,
            'view_count': view_count,
        }
class CamdemyFolderIE(InfoExtractor):
    """Playlist extractor for camdemy.com folder pages."""
    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
    _TESTS = [{
        # links with trailing slash
        'url': 'http://www.camdemy.com/folder/450',
        'info_dict': {
            'id': '450',
            'title': '信號與系統 2012 & 2011 (Signals and Systems)',
        },
        'playlist_mincount': 145
    }, {
        # links without trailing slash
        # and multi-page
        'url': 'http://www.camdemy.com/folder/853',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab'
        },
        'playlist_mincount': 20
    }, {
        # with displayMode parameter. For testing the codes to add parameters
        'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab'
        },
        'playlist_mincount': 20
    }]

    def _real_extract(self, url):
        folder_id = self._match_id(url)

        # Force displayMode=list so every media link shows up on one page.
        url_parts = list(compat_urlparse.urlparse(url))
        params = dict(compat_urlparse.parse_qsl(url_parts[4]))
        params['displayMode'] = 'list'
        url_parts[4] = compat_urllib_parse_urlencode(params)
        listing = self._download_webpage(
            compat_urlparse.urlunparse(url_parts), folder_id)

        entries = [
            self.url_result('http://www.camdemy.com' + media_path)
            for media_path in re.findall(r"href='(/media/\d+/?)'", listing)
        ]
        return self.playlist_result(
            entries, folder_id, self._html_search_meta('keywords', listing))
class CamModelsIE(InfoExtractor):
    """Extractor for live cammodels.com streams via the naiadsystems
    manifest server.
    """
    _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.cammodels.com/cam/AutumnKnight/',
        'only_matching': True,
        'age_limit': 18
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)

        manifest = self._download_json(
            'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id)

        formats = []
        thumbnails = []
        # The manifest groups encodings by delivery type (rtmp/hls/jpeg/...).
        # Defensive isinstance checks guard against malformed entries.
        for format_id, format_dict in manifest['formats'].items():
            if not isinstance(format_dict, dict):
                continue
            encodings = format_dict.get('encodings')
            if not isinstance(encodings, list):
                continue
            vcodec = format_dict.get('videoCodec')
            acodec = format_dict.get('audioCodec')
            for media in encodings:
                if not isinstance(media, dict):
                    continue
                media_url = url_or_none(media.get('location'))
                if not media_url:
                    continue

                format_id_list = [format_id]
                height = int_or_none(media.get('videoHeight'))
                if height is not None:
                    format_id_list.append('%dp' % height)
                f = {
                    'url': media_url,
                    'format_id': '-'.join(format_id_list),
                    'width': int_or_none(media.get('videoWidth')),
                    'height': height,
                    'vbr': int_or_none(media.get('videoKbps')),
                    'abr': int_or_none(media.get('audioKbps')),
                    'fps': int_or_none(media.get('fps')),
                    'vcodec': vcodec,
                    'acodec': acodec,
                }
                if 'rtmp' in format_id:
                    f['ext'] = 'flv'
                elif 'hls' in format_id:
                    f.update({
                        'ext': 'mp4',
                        # hls skips fragments, preferring rtmp
                        'preference': -1,
                    })
                else:
                    # jpeg entries are preview snapshots, not video formats;
                    # any other unknown delivery type is dropped entirely.
                    if format_id == 'jpeg':
                        thumbnails.append({
                            'url': f['url'],
                            'width': f['width'],
                            'height': f['height'],
                            'format_id': f['format_id'],
                        })
                    continue
                formats.append(f)
        self._sort_formats(formats)

        return {
            'id': user_id,
            'title': self._live_title(user_id),
            'thumbnails': thumbnails,
            'is_live': True,
            'formats': formats,
            'age_limit': 18
        }
class CamTubeIE(InfoExtractor):
    """Extractor for camtube.co recordings (session-cookie gated API)."""
    _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female',
        'info_dict': {
            'id': '42ad3956-dd5b-445a-8313-803ea6079fac',
            'display_id': 'minafay-030618-1136-chaturbate-female',
            'ext': 'mp4',
            'title': 'minafay-030618-1136-chaturbate-female',
            'duration': 1274,
            'timestamp': 1528018608,
            'upload_date': '20180603',
            'age_limit': 18
        },
        'params': {
            'skip_download': True,
        },
    }]

    _API_BASE = 'https://api.camtube.co'

    def _real_extract(self, url):
        display_id = self._match_id(url)

        # A fresh session token must be set as a cookie before the
        # recording metadata can be requested.
        token = self._download_json(
            '%s/rpc/session/new' % self._API_BASE, display_id,
            'Downloading session token')['token']
        self._set_cookie('api.camtube.co', 'session', token)

        video = self._download_json(
            '%s/recordings/%s' % (self._API_BASE, display_id), display_id,
            headers={'Referer': url})
        video_id = video['uuid']

        return {
            'id': video_id,
            'display_id': display_id,
            # No dedicated title field; the URL slug doubles as title.
            'title': display_id,
            'timestamp': unified_timestamp(video.get('createdAt')),
            'duration': int_or_none(video.get('duration')),
            'view_count': int_or_none(video.get('viewCount')),
            'like_count': int_or_none(video.get('likeCount')),
            'creator': video.get('stageName'),
            'formats': [{
                'url': '%s/recordings/%s/manifest.m3u8' % (self._API_BASE, video_id),
                'format_id': 'hls',
                'ext': 'mp4',
                'protocol': 'm3u8_native',
            }],
            'age_limit': 18
        }
class CamWithHerIE(InfoExtractor):
    """Extractor for camwithher.tv recorded videos (RTMP delivery)."""
    _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)'
    _TESTS = [{
        'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=',
        'info_dict': {
            'id': '5644',
            'ext': 'flv',
            'title': 'Periscope Tease',
            'description': 'In the clouds teasing on periscope to my favorite song',
            'duration': 240,
            'view_count': int,
            'comment_count': int,
            'uploader': 'MileenaK',
            'upload_date': '20160322',
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        }
    }, {
        'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937',
        'only_matching': True,
    }, {
        'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=',
        'only_matching': True,
    }, {
        'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        flv_id = self._html_search_regex(
            r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id')

        # Video URL construction algorithm is reverse-engineered from cwhplayer.swf
        if int(flv_id) > 2010:
            rtmp_path = 'mp4:%s.mp4' % flv_id
        else:
            rtmp_path = flv_id
        rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % rtmp_path

        title = self._html_search_regex(
            r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title')
        description = self._html_search_regex(
            r'>Description:</span>(.+?)</div>', webpage, 'description', default=None)

        runtime = self._search_regex(
            r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None)
        # Strip whitespace/dashes so parse_duration can handle the value.
        duration = parse_duration(
            re.sub(r'[\s-]', '', runtime) if runtime else runtime)

        view_count = int_or_none(self._search_regex(
            r'Views\s*:\s*(\d+)', webpage, 'view count', default=None))
        comment_count = int_or_none(self._search_regex(
            r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None))
        uploader = self._search_regex(
            r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None)
        upload_date = unified_strdate(self._search_regex(
            r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None))

        return {
            'id': flv_id,
            'url': rtmp_url,
            'ext': 'flv',
            'no_resume': True,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'comment_count': comment_count,
            'uploader': uploader,
            'upload_date': upload_date,
            'age_limit': 18
        }
class Canalc2IE(InfoExtractor):
    """Extractor for canalc2.tv (and its u-strasbg.fr archive mirror)."""
    IE_NAME = 'canalc2.tv'
    _VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.canalc2.tv/video/12163',
        'md5': '060158428b650f896c542dfbb3d6487f',
        'info_dict': {
            'id': '12163',
            'ext': 'mp4',
            'title': 'Terrasses du Numérique',
            'duration': 122,
        },
    }, {
        'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # Always fetch the canonical canalc2.tv page, even for archive URLs.
        webpage = self._download_webpage(
            'http://www.canalc2.tv/video/%s' % video_id, video_id)

        title = self._html_search_regex(
            r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>',
            webpage, 'title')

        formats = []
        # Legacy player config assigns media URLs via `file = "..."`.
        for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage):
            if video_url.startswith('rtmp://'):
                # Split the RTMP URL into app and play path for the downloader.
                rtmp = re.search(
                    r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
                formats.append({
                    'url': rtmp.group('url'),
                    'format_id': 'rtmp',
                    'ext': 'flv',
                    'app': rtmp.group('app'),
                    'play_path': rtmp.group('play_path'),
                    'page_url': url,
                })
            else:
                formats.append({
                    'url': video_url,
                    'format_id': 'http',
                })

        if formats:
            info = {
                'formats': formats,
            }
        else:
            # Newer pages use a plain HTML5 <video> element instead.
            info = self._parse_html5_media_entries(url, webpage, url)[0]

        self._sort_formats(info['formats'])

        info.update({
            'id': video_id,
            'title': title,
            'duration': parse_duration(self._search_regex(
                r'id=["\']video_duree["\'][^>]*>([^<]+)',
                webpage, 'duration', fatal=False)),
        })
        return info
class CanalplusIE(InfoExtractor):
    """Extractor for mycanal.fr and piwiplus.fr via the canal-plus.com
    getVideosLiees REST API.
    """
    IE_DESC = 'mycanal.fr and piwiplus.fr'
    _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)'
    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json'
    # Maps the public site name to the internal site id used by the API.
    _SITE_ID_MAP = {
        'mycanal': 'cplus',
        'piwiplus': 'teletoon',
    }

    # Only works for direct mp4 URLs
    _GEO_COUNTRIES = ['FR']

    _TESTS = [{
        'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061',
        'info_dict': {
            'id': '1397061',
            'display_id': 'lolywood',
            'ext': 'mp4',
            'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34',
            'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e',
            'upload_date': '20160602',
        },
    }, {
        # geo restricted, bypassed
        'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
        'info_dict': {
            'id': '1108190',
            'display_id': 'pid1405-le-labyrinthe-boing-super-ranger',
            'ext': 'mp4',
            'title': 'BOING SUPER RANGER - Ep : Le labyrinthe',
            'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff',
            'upload_date': '20140724',
        },
        'expected_warnings': ['HTTP Error 403: Forbidden'],
    }]

    def _real_extract(self, url):
        site, display_id, video_id = re.match(self._VALID_URL, url).groups()

        site_id = self._SITE_ID_MAP[site]

        info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
        video_data = self._download_json(info_url, video_id, 'Downloading video JSON')

        # The API may return the requested video plus related ones; keep
        # only the entry whose ID matches.
        if isinstance(video_data, list):
            video_data = [video for video in video_data if video.get('ID') == video_id][0]
        media = video_data['MEDIA']
        infos = video_data['INFOS']

        preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD'])

        # _, fmt_url = next(iter(media['VIDEOS'].items()))
        # if '/geo' in fmt_url.lower():
        #     response = self._request_webpage(
        #         HEADRequest(fmt_url), video_id,
        #         'Checking if the video is georestricted')
        #     if '/blocage' in response.geturl():
        #         raise ExtractorError(
        #             'The video is not available in your country',
        #             expected=True)

        formats = []
        for format_id, format_url in media['VIDEOS'].items():
            if not format_url:
                continue
            if format_id == 'HLS':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False))
            elif format_id == 'HDS':
                formats.extend(self._extract_f4m_formats(
                    format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False))
            else:
                formats.append({
                    # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js
                    'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes',
                    'format_id': format_id,
                    'preference': preference(format_id),
                })
        self._sort_formats(formats)

        thumbnails = [{
            'id': image_id,
            'url': image_url,
        } for image_id, image_url in media.get('images', {}).items()]

        titrage = infos['TITRAGE']

        return {
            'id': video_id,
            'display_id': display_id,
            'title': '%s - %s' % (titrage['TITRE'],
                                  titrage['SOUS_TITRE']),
            'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')),
            'thumbnails': thumbnails,
            'description': infos.get('DESCRIPTION'),
            'duration': int_or_none(infos.get('DURATION')),
            'view_count': int_or_none(infos.get('NB_VUES')),
            'like_count': int_or_none(infos.get('NB_LIKES')),
            'comment_count': int_or_none(infos.get('NB_COMMENTS')),
            'formats': formats,
        }
class CanvasIE(InfoExtractor):
    """Extractor for VRT mediazone asset URLs (canvas, een, ketnet, sporza,
    dako, vrtvideo, vrtnieuws).

    Tries the legacy mediazone API first (more formats), then falls back to
    the vualto media-services REST API, which needs a player token.
    """
    _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
        'md5': '68993eda72ef62386a15ea2cf3c93107',
        'info_dict': {
            'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
            'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
            'ext': 'mp4',
            'title': 'Nachtwacht: De Greystook',
            'description': 'Nachtwacht: De Greystook',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1468.04,
        },
        'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
    }, {
        'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
        'only_matching': True,
    }]
    _GEO_BYPASS = False
    # Maps the API's HLS variants to the matching downloader protocol;
    # AES-encrypted HLS must go through the generic m3u8 downloader.
    _HLS_ENTRY_PROTOCOLS_MAP = {
        'HLS': 'm3u8_native',
        'HLS_AES': 'm3u8',
    }
    _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        site_id, video_id = mobj.group('site_id'), mobj.group('id')

        data = None
        if site_id != 'vrtvideo':
            # Old API endpoint, serves more formats but may fail for some videos
            data = self._download_json(
                'https://mediazone.vrt.be/api/v1/%s/assets/%s'
                % (site_id, video_id), video_id, 'Downloading asset JSON',
                'Unable to download asset JSON', fatal=False)

        # New API endpoint
        if not data:
            headers = self.geo_verification_headers()
            headers.update({'Content-Type': 'application/json'})
            # An empty POST issues a player token needed for the video query.
            token = self._download_json(
                '%s/tokens' % self._REST_API_BASE, video_id,
                'Downloading token', data=b'', headers=headers)['vrtPlayerToken']
            # expected_status=400: error responses still carry a JSON body
            # with a machine-readable 'code' that is inspected below.
            data = self._download_json(
                '%s/videos/%s' % (self._REST_API_BASE, video_id),
                video_id, 'Downloading video JSON', query={
                    'vrtPlayerToken': token,
                    'client': '%s@PROD' % site_id,
                }, expected_status=400)
            if not data.get('title'):
                code = data.get('code')
                if code == 'AUTHENTICATION_REQUIRED':
                    self.raise_login_required()
                elif code == 'INVALID_LOCATION':
                    self.raise_geo_restricted(countries=['BE'])
                raise ExtractorError(data.get('message') or code, expected=True)

        title = data['title']
        description = data.get('description')

        formats = []
        for target in data['targetUrls']:
            format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
            if not format_url or not format_type:
                continue
            format_type = format_type.upper()
            if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
                    m3u8_id=format_type, fatal=False))
            elif format_type == 'HDS':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, f4m_id=format_type, fatal=False))
            elif format_type == 'MPEG_DASH':
                formats.extend(self._extract_mpd_formats(
                    format_url, video_id, mpd_id=format_type, fatal=False))
            elif format_type == 'HSS':
                formats.extend(self._extract_ism_formats(
                    format_url, video_id, ism_id='mss', fatal=False))
            else:
                formats.append({
                    'format_id': format_type,
                    'url': format_url,
                })
        self._sort_formats(formats)

        subtitles = {}
        subtitle_urls = data.get('subtitleUrls')
        if isinstance(subtitle_urls, list):
            for subtitle in subtitle_urls:
                subtitle_url = subtitle.get('url')
                # Only 'CLOSED' captions are kept; all are Dutch ('nl').
                if subtitle_url and subtitle.get('type') == 'CLOSED':
                    subtitles.setdefault('nl', []).append({'url': subtitle_url})

        return {
            'id': video_id,
            'display_id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            # API reports duration in milliseconds.
            'duration': float_or_none(data.get('duration'), 1000),
            'thumbnail': data.get('posterImageUrl'),
            'subtitles': subtitles,
        }
class CanvasEenIE(InfoExtractor):
    """Extractor for canvas.be and een.be pages; resolves the page to a
    mediazone asset URL handled by CanvasIE.
    """
    IE_DESC = 'canvas.be and een.be'
    _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
        'md5': 'ed66976748d12350b118455979cca293',
        'info_dict': {
            'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
            'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
            'ext': 'flv',
            'title': 'De afspraak veilt voor de Warmste Week',
            'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 49.02,
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        # with subtitles
        'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
        'info_dict': {
            'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
            'display_id': 'pieter-0167',
            'ext': 'mp4',
            'title': 'Pieter 0167',
            'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 2553.08,
            'subtitles': {
                'nl': [{
                    'ext': 'vtt',
                }],
            },
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Pagina niet gevonden',
    }, {
        'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
        'info_dict': {
            'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
            'display_id': 'emma-pakt-thilly-aan',
            'ext': 'mp4',
            'title': 'Emma pakt Thilly aan',
            'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 118.24,
        },
        'params': {
            'skip_download': True,
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        site_id = mobj.group('site_id')
        display_id = mobj.group('id')

        webpage = self._download_webpage(url, display_id)

        # Prefer the on-page header title; fall back to Open Graph metadata.
        raw_title = self._search_regex(
            r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
            webpage, 'title', default=None)
        if not raw_title:
            raw_title = self._og_search_title(webpage, default=None)
        title = strip_or_none(raw_title)

        video_id = self._html_search_regex(
            r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
            group='id')

        # Hand off to CanvasIE, keeping the page-level metadata.
        return {
            '_type': 'url_transparent',
            'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
            'ie_key': CanvasIE.ie_key(),
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': self._og_search_description(webpage),
        }
class VrtNUIE(GigyaBaseIE):
    """Extractor for VRT NU (vrt.be/vrtnu), which requires Gigya login."""
    IE_DESC = 'VrtNU.be'
    _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
    _TESTS = [{
        # Available via old API endpoint
        'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',
        'info_dict': {
            'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
            'ext': 'mp4',
            'title': 'Postbus X - Aflevering 1 (Seizoen 1989)',
            'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7',
            'duration': 1457.04,
            'thumbnail': r're:^https?://.*\.jpg$',
            'series': 'Postbus X',
            'season': 'Seizoen 1989',
            'season_number': 1989,
            'episode': 'De zwarte weduwe',
            'episode_number': 1,
            'timestamp': 1595822400,
            'upload_date': '20200727',
        },
        'skip': 'This video is only available for registered users',
        'params': {
            'username': '<snip>',
            'password': '<snip>',
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        # Only available via new API endpoint
        'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
        'info_dict': {
            'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
            'ext': 'mp4',
            'title': 'Aflevering 5',
            'description': 'Wie valt door de mand tijdens een missie?',
            'duration': 2967.06,
            'season': 'Season 1',
            'season_number': 1,
            'episode_number': 5,
        },
        'skip': 'This video is only available for registered users',
        'params': {
            'username': '<snip>',
            'password': '<snip>',
        },
        'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
    }]
    _NETRC_MACHINE = 'vrtnu'
    _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
    _CONTEXT_ID = 'R3595707040'

    def _real_initialize(self):
        # Log in (if credentials are available) before any extraction.
        self._login()

    def _login(self):
        username, password = self._get_login_info()
        if username is None:
            return

        auth_data = {
            'APIKey': self._APIKEY,
            'targetEnv': 'jssdk',
            'loginID': username,
            'password': password,
            'authMode': 'cookie',
        }

        auth_info = self._gigya_login(auth_data)

        # Sometimes authentication fails for no good reason, retry
        login_attempt = 1
        while login_attempt <= 3:
            try:
                # When requesting a token, no actual token is returned, but the
                # necessary cookies are set.
                self._request_webpage(
                    'https://token.vrt.be',
                    None, note='Requesting a token', errnote='Could not get a token',
                    headers={
                        'Content-Type': 'application/json',
                        'Referer': 'https://www.vrt.be/vrtnu/',
                    },
                    data=json.dumps({
                        'uid': auth_info['UID'],
                        'uidsig': auth_info['UIDSignature'],
                        'ts': auth_info['signatureTimestamp'],
                        'email': auth_info['profile']['email'],
                    }).encode('utf-8'))
            except ExtractorError as e:
                # Retry (up to 3 times) only on HTTP 401; any other error
                # is fatal and re-raised.
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                    login_attempt += 1
                    self.report_warning('Authentication failed')
                    self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
                else:
                    raise e
            else:
                break

    def _real_extract(self, url):
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        # The <nui-media> element carries the video and publication ids.
        attrs = extract_attributes(self._search_regex(
            r'(<nui-media[^>]+>)', webpage, 'media element'))
        video_id = attrs['videoid']
        publication_id = attrs.get('publicationid')
        if publication_id:
            # Mediazone asset ids are '<publication>$<video>'.
            video_id = publication_id + '$' + video_id

        # Optional page analytics object; only used for the season number.
        page = (self._parse_json(self._search_regex(
            r'digitalData\s*=\s*({.+?});', webpage, 'digial data',
            default='{}'), video_id, fatal=False) or {}).get('page') or {}

        info = self._search_json_ld(webpage, display_id, default={})
        # Delegate format extraction to CanvasIE via the mediazone URL.
        return merge_dicts(info, {
            '_type': 'url_transparent',
            'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
            'ie_key': CanvasIE.ie_key(),
            'id': video_id,
            'display_id': display_id,
            'season_number': int_or_none(page.get('episode_season')),
        })
class DagelijkseKostIE(InfoExtractor):
    """Extractor for dagelijksekost.een.be recipe videos; delegates format
    extraction to CanvasIE via the mediazone asset URL.
    """
    IE_DESC = 'dagelijksekost.een.be'
    _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
        'md5': '30bfffc323009a3e5f689bef6efa2365',
        'info_dict': {
            'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
            'display_id': 'hachis-parmentier-met-witloof',
            'ext': 'mp4',
            'title': 'Hachis parmentier met witloof',
            'description': 'md5:9960478392d87f63567b5b117688cdc5',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 283.02,
        },
        'expected_warnings': ['is not a supported codec'],
    }

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Prefer on-page recipe markup; fall back to social/meta tags.
        heading = get_element_by_class('dish-metadata__title', webpage)
        if not heading:
            heading = self._html_search_meta('twitter:title', webpage)
        title = strip_or_none(heading)

        description = clean_html(get_element_by_class('dish-description', webpage))
        if not description:
            description = self._html_search_meta(
                ('description', 'twitter:description', 'og:description'),
                webpage)

        video_id = self._html_search_regex(
            r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
            group='id')

        return {
            '_type': 'url_transparent',
            'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
            'ie_key': CanvasIE.ie_key(),
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
        }
class CarambaTVIE(InfoExtractor):
    """Extractor for video1.carambatv.ru direct video ids."""
    _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://video1.carambatv.ru/v/191910501',
        'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a',
        'info_dict': {
            'id': '191910501',
            'ext': 'mp4',
            'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 2678.31,
        },
    }, {
        'url': 'carambatv:191910501',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        video = self._download_json(
            'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id,
            video_id)

        title = video['title']
        base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id

        formats = []
        for quality in video['qualities']:
            if not quality.get('fn'):
                continue
            formats.append({
                'url': base_url + quality['fn'],
                'height': int_or_none(quality.get('height')),
                'format_id': '%sp' % quality['height'] if quality.get('height') else None,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': video.get('splash'),
            # Duration is inferred from the end time of the last annotation.
            'duration': float_or_none(try_get(
                video, lambda x: x['annotations'][0]['end_time'], compat_str)),
            'formats': formats,
        }
def _real_extract(self, url):
    """Delegate a carambatv.ru page either to Videomore or to CarambaTVIE."""
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)

    # Newer pages embed a Videomore player, either as a full embed URL or
    # via a getVMCode(<id>) call.
    videomore_url = VideomoreIE._extract_url(webpage)
    if not videomore_url:
        videomore_id = self._search_regex(
            r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id',
            default=None)
        if videomore_id:
            videomore_url = 'videomore:%s' % videomore_id
    if videomore_url:
        return {
            '_type': 'url_transparent',
            'url': videomore_url,
            'ie_key': VideomoreIE.ie_key(),
            'title': self._og_search_title(webpage),
        }

    # Legacy pages expose the native player through og:video:iframe or a
    # bare video id in the page scripts.
    video_url = self._og_search_property('video:iframe', webpage, default=None)
    if not video_url:
        video_id = self._search_regex(
            r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)',
            webpage, 'video id')
        video_url = 'carambatv:%s' % video_id

    return self.url_result(video_url, CarambaTVIE.ie_key())
def _real_extract(self, url):
    """Read player globals from the page and hand off to Turner ngtv."""
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)

    def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False):
        # A value may live on _cnglobal.currentVideo.* or on
        # video_metadata.content_*; build one alternation covering both.
        metadata_re = r'|video_metadata\.content_' + content_re if content_re else ''
        return self._search_regex(
            r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (
                global_re, metadata_re, value_re),
            webpage, name, fatal=fatal)

    media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True)
    title = find_field(
        'episodeTitle', 'title', '(?:episodeName|name)', fatal=True)

    info = self._extract_ngtv_info(
        media_id, {'networkId': 'cartoonnetwork'}, {
            'url': url,
            'site_name': 'CartoonNetwork',
            'auth_required': find_field('authType', 'auth type') != 'unauth',
        })

    info.update({
        'id': media_id,
        'display_id': display_id,
        'title': title,
        'description': self._html_search_meta('description', webpage),
        'series': find_field(
            'propertyName', 'series',
            'showName') or self._html_search_meta('partOfSeries', webpage),
        'episode': title,
    })
    # Season/episode numbers come from the same globals, with a meta fallback.
    for field in ('season', 'episode'):
        field_name = field + 'Number'
        info[field + '_number'] = int_or_none(find_field(
            field_name, field + ' number', value_re=r'\d+')
            or self._html_search_meta(field_name, webpage))
    return info
..compat import ( compat_str, compat_HTTPError, ) from ..utils import ( js_to_json, smuggle_url, try_get, xpath_text, xpath_element, xpath_with_ns, find_xpath_attr, orderedSet, parse_duration, parse_iso8601, parse_age_limit, strip_or_none, int_or_none, ExtractorError, ) class CBCIE(InfoExtractor): IE_NAME = 'cbc.ca' _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ # with mediaId 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', 'md5': '97e24d09672fc4cf56256d6faa6c25bc', 'info_dict': { 'id': '2682904050', 'ext': 'mp4', 'title': 'Don Cherry – All-Stars', 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.', 'timestamp': 1454463000, 'upload_date': '20160203', 'uploader': 'CBCC-NEW', }, 'skip': 'Geo-restricted to Canada', }, { # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', 'md5': '162adfa070274b144f4fdc3c3b8207db', 'info_dict': { 'id': '2414435309', 'ext': 'mp4', 'title': '22 Minutes Update: What Not To Wear Quebec', 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", 'upload_date': '20131025', 'uploader': 'CBCC-NEW', 'timestamp': 1382717907, }, }, { # with clipId, feed only available via tpfeed.cbc.ca 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', 'md5': '0274a90b51a9b4971fe005c63f592f12', 'info_dict': { 'id': '2487345465', 'ext': 'mp4', 'title': 'Robin Williams freestyles on 90 Minutes Live', 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', 'upload_date': '19780210', 'uploader': 'CBCC-NEW', 'timestamp': 255977160, }, }, { # multiple iframes 'url': 
'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', 'playlist': [{ 'md5': '377572d0b49c4ce0c9ad77470e0b96b4', 'info_dict': { 'id': '2680832926', 'ext': 'mp4', 'title': 'An Eagle\'s-Eye View Off Burrard Bridge', 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', 'upload_date': '20160201', 'timestamp': 1454342820, 'uploader': 'CBCC-NEW', }, }, { 'md5': '415a0e3f586113894174dfb31aa5bb1a', 'info_dict': { 'id': '2658915080', 'ext': 'mp4', 'title': 'Fly like an eagle!', 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', 'upload_date': '20150315', 'timestamp': 1426443984, 'uploader': 'CBCC-NEW', }, }], 'skip': 'Geo-restricted to Canada', }, { # multiple CBC.APP.Caffeine.initInstance(...) 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', 'info_dict': { 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', 'id': 'dog-indoor-exercise-winter-1.3928238', 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, 'playlist_mincount': 6, }] @classmethod def suitable(cls, url): return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) def _extract_player_init(self, player_init, display_id): player_info = self._parse_json(player_init, display_id, js_to_json) media_id = player_info.get('mediaId') if not media_id: clip_id = player_info['clipId'] feed = self._download_json( 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, clip_id, fatal=False) if feed: media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) if not media_id: media_id = self._download_json( 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, clip_id)['entries'][0]['id'].split('/')[-1] return self.url_result('cbcplayer:%s' 
def _real_extract(self, url):
    """Collect every player init / embed on a cbc.ca page into a playlist.

    FIX: the HTML-tag prefixes of three regexes (`<title>…</title>`,
    `<iframe[^>]`, `<div[^>]`) had been stripped by a bad text extraction,
    leaving patterns that could never match real markup; restored.
    """
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)
    title = self._og_search_title(webpage, default=None) or self._html_search_meta(
        'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
        r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
    # Inline CBC.APP.Caffeine players carry their config as a JS object.
    entries = [
        self._extract_player_init(player_init, display_id)
        for player_init in re.findall(
            r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
    media_ids = []
    for media_id_re in (
            r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
            r'<div[^>]+\bid=["\']player-(\d+)',
            r'guid["\']\s*:\s*["\'](\d+)'):
        media_ids.extend(re.findall(media_id_re, webpage))
    entries.extend([
        self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
        for media_id in orderedSet(media_ids)])
    return self.playlist_result(
        entries, display_id, strip_or_none(title),
        self._og_search_description(webpage))
def _real_extract(self, url):
    """Forward a CBC player id to ThePlatform using the CBC media account."""
    video_id = self._match_id(url)
    # force_smil_url makes ThePlatform fetch the SMIL directly.
    theplatform_url = smuggle_url(
        'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id,
        {'force_smil_url': True})
    return {
        '_type': 'url_transparent',
        'ie_key': 'ThePlatform',
        'url': theplatform_url,
        'id': video_id,
    }
def _register_device(self):
    """Acquire and cache a Clearleap device id/token, logging in when
    credentials are configured.

    FIX: the XML literals in the register and login request bodies had
    been stripped of their tags by a bad text extraction (`b'web'` and
    `'{0}{1}web'` are the tag-less residue); restored the real payloads.
    """
    result = self._download_xml(
        self._API_BASE_URL + 'device/register',
        None, 'Acquiring device token',
        data=b'<device><type>web</type></device>')
    self._device_id = xpath_text(result, 'deviceId', fatal=True)
    email, password = self._get_login_info()
    if email and password:
        signature = self._signature(email, password)
        data = '<login><token>{0}</token><device><deviceId>{1}</deviceId><type>web</type></device></login>'.format(
            escape(signature), escape(self._device_id)).encode()
        url = self._API_BASE_URL + 'device/login'
        result = self._download_xml(
            url, None, data=data,
            headers={'content-type': 'application/xml'})
        self._device_token = xpath_text(result, 'token', fatal=True)
    else:
        # Anonymous device: the register response already carries a token.
        self._device_token = xpath_text(result, 'deviceToken', fatal=True)
    self._downloader.cache.store(
        'cbcwatch', self._cache_device_key(), {
            'id': self._device_id,
            'token': self._device_token,
        })
def _parse_rss_feed(self, rss):
    """Turn a Clearleap RSS <channel> into a url_transparent playlist."""
    channel = xpath_element(rss, 'channel', fatal=True)

    def ns(path):
        return xpath_with_ns(path, self._NS_MAP)

    entries = []
    for item in channel.findall('item'):
        guid = xpath_text(item, 'guid', fatal=True)
        title = xpath_text(item, 'title', fatal=True)

        media_group = xpath_element(item, ns('media:group'), fatal=True)
        content = xpath_element(media_group, ns('media:content'), fatal=True)
        content_url = content.attrib['url']

        # Thumbnails without a URL are useless and get dropped.
        thumbnails = [{
            'id': thumb.get('profile'),
            'url': thumb.get('url'),
            'width': int_or_none(thumb.get('width')),
            'height': int_or_none(thumb.get('height')),
        } for thumb in media_group.findall(ns('media:thumbnail'))
            if thumb.get('url')]

        timestamp = None
        release_date = find_xpath_attr(
            item, ns('media:credit'), 'role', 'releaseDate')
        if release_date is not None:
            timestamp = parse_iso8601(release_date.text)

        entries.append({
            '_type': 'url_transparent',
            'url': content_url,
            'id': guid,
            'title': title,
            'description': xpath_text(item, 'description'),
            'timestamp': timestamp,
            'duration': int_or_none(content.get('duration')),
            'age_limit': parse_age_limit(xpath_text(item, ns('media:rating'))),
            'episode': xpath_text(item, ns('clearleap:episode')),
            'episode_number': int_or_none(
                xpath_text(item, ns('clearleap:episodeInSeason'))),
            'series': xpath_text(item, ns('clearleap:series')),
            'season_number': int_or_none(
                xpath_text(item, ns('clearleap:season'))),
            'thumbnails': thumbnails,
            'ie_key': 'CBCWatchVideo',
        })

    return self.playlist_result(
        entries, xpath_text(channel, 'guid'),
        xpath_text(channel, 'title'),
        xpath_text(channel, 'description'))
def _real_extract(self, url):
    """Resolve a Clearleap play URL into HLS formats plus RSS metadata."""
    video_id = self._match_id(url)
    result = self._call_api(url, video_id)

    m3u8_url = xpath_text(result, 'url', fatal=True)
    # Try the multi-bitrate master playlist first (…/<name>/<name>.m3u8);
    # if that only yields one format, fall back to the advertised URL.
    master_url = re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url)
    formats = self._extract_m3u8_formats(
        master_url, video_id, 'mp4', fatal=False)
    if len(formats) < 2:
        formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
    # Clearleap encodes the audio codec in the format id.
    for f in formats:
        format_id = f.get('format_id')
        if format_id.startswith('AAC'):
            f['acodec'] = 'aac'
        elif format_id.startswith('AC3'):
            f['acodec'] = 'ac-3'
    self._sort_formats(formats)

    info = {
        'id': video_id,
        'title': video_id,
        'formats': formats,
    }
    rss = xpath_element(result, 'rss')
    if rss:
        # Enrich with metadata from the embedded RSS item, dropping the
        # url_transparent plumbing since formats are already resolved here.
        info.update(self._parse_rss_feed(rss)['entries'][0])
        del info['url']
        del info['_type']
        del info['ie_key']
    return info
def _real_extract(self, url):
    """Extract an olympics.cbc.ca video: hidden input -> video XML ->
    Akamai-tokenized CDN URLs (ISM or HLS)."""
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)
    # The numeric video id is carried in a hidden form input on the page.
    video_id = self._hidden_inputs(webpage)['videoId']
    video_doc = self._download_xml(
        'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id)
    title = xpath_text(video_doc, 'title', fatal=True)
    is_live = xpath_text(video_doc, 'kind') == 'Live'
    if is_live:
        title = self._live_title(title)
    formats = []
    for video_source in video_doc.findall('videoSources/videoSource'):
        uri = xpath_text(video_source, 'uri')
        if not uri:
            continue
        # Every source URI must first be tokenized by the Akamai gateway.
        tokenize = self._download_json(
            'https://olympics.cbc.ca/api/api-akamai/tokenize',
            video_id, data=json.dumps({
                'VideoSource': uri,
            }).encode(), headers={
                'Content-Type': 'application/json',
                'Referer': url,
                # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js
                'Cookie': '_dvp=TK:C0ObxjerU',  # AKAMAI CDN cookie
            }, fatal=False)
        if not tokenize:
            continue
        content_url = tokenize['ContentUrl']
        video_source_format = video_source.get('format')
        if video_source_format == 'IIS':
            # Smooth Streaming source.
            formats.extend(self._extract_ism_formats(
                content_url, video_id, ism_id=video_source_format, fatal=False))
        else:
            # HLS source; live streams must not use the native downloader.
            formats.extend(self._extract_m3u8_formats(
                content_url, video_id, 'mp4',
                'm3u8' if is_live else 'm3u8_native',
                m3u8_id=video_source_format, fatal=False))
    self._sort_formats(formats)
    return {
        'id': video_id,
        'display_id': display_id,
        'title': title,
        'description': xpath_text(video_doc, 'description'),
        'thumbnail': xpath_text(video_doc, 'thumbnailUrl'),
        'duration': parse_duration(xpath_text(video_doc, 'duration')),
        'formats': formats,
        'is_live': is_live,
    }
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect closed-caption URLs from SMIL <param> elements."""
    subtitles = {}
    caption_params = (
        ('sMPTE-TTCCURL', 'tt'),
        ('ClosedCaptionURL', 'ttml'),
        ('webVTTCaptionURL', 'vtt'),
    )
    for param_name, ext in caption_params:
        param = find_xpath_attr(
            smil, self._xpath_ns('.//param', namespace), 'name', param_name)
        if param is None:
            continue
        cc_url = param.get('value')
        if cc_url:
            subtitles.setdefault(subtitles_lang, []).append({
                'ext': ext,
                'url': cc_url,
            })
    return subtitles
def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
    """Resolve a CBS content id via the thunder videoPlayerService and
    per-asset-type ThePlatform SMIL feeds into a full info dict."""
    items_data = self._download_xml(
        'http://can.cbs.com/thunder/player/videoPlayerService.php',
        content_id, query={'partner': site, 'contentId': content_id})
    video_data = xpath_element(items_data, './/item')
    title = xpath_text(video_data, 'videoTitle', 'title', True)
    tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)
    tp_release_url = 'http://link.theplatform.com/s/' + tp_path
    asset_types = []
    subtitles = {}
    formats = []
    last_e = None
    for item in items_data.findall('.//item'):
        asset_type = xpath_text(item, 'assetType')
        # Skip duplicates and DRM/unsupported asset flavours.
        if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type:
            continue
        asset_types.append(asset_type)
        query = {
            'mbr': 'true',
            'assetTypes': asset_type,
        }
        # Request container formats matching the asset's delivery protocol.
        if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'):
            query['formats'] = 'MPEG4,M3U'
        elif asset_type in ('RTMP', 'WIFI', '3G'):
            query['formats'] = 'MPEG4,FLV'
        try:
            tp_formats, tp_subtitles = self._extract_theplatform_smil(
                update_url_query(tp_release_url, query), content_id,
                'Downloading %s SMIL data' % asset_type)
        except ExtractorError as e:
            # Remember the failure; other asset types may still succeed.
            last_e = e
            continue
        formats.extend(tp_formats)
        subtitles = self._merge_subtitles(subtitles, tp_subtitles)
    if last_e and not formats:
        # Every asset type failed - surface the last error to the caller.
        raise last_e
    self._sort_formats(formats)
    info = self._extract_theplatform_metadata(tp_path, content_id)
    info.update({
        'id': content_id,
        'title': title,
        'series': xpath_text(video_data, 'seriesTitle'),
        'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
        'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
        # videoLength is in milliseconds.
        'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
        'thumbnail': xpath_text(video_data, 'previewImageURL'),
        'formats': formats,
        'subtitles': subtitles,
    })
    return info
def _real_extract(self, url):
    """Parse the cnet/zdnet player options JSON and delegate to CBSIE."""
    site, display_id = re.match(self._VALID_URL, url).groups()
    webpage = self._download_webpage(url, display_id)

    data_json = self._html_search_regex(
        r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'",
        webpage, 'data json')
    data = self._parse_json(data_json, display_id)
    # Single-video pages use 'video'; playlist pages carry a list.
    vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0]

    video_id = vdata['mpxRefId']
    title = vdata['title']

    author = vdata.get('author')
    if author:
        uploader = '%s %s' % (author['firstName'], author['lastName'])
        uploader_id = author.get('id')
    else:
        uploader = uploader_id = None

    info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site])
    info.update({
        'id': video_id,
        'display_id': display_id,
        'title': title,
        'duration': int_or_none(vdata.get('duration')),
        'uploader': uploader,
        'uploader_id': uploader_id,
    })
    return info
class CBSLocalIE(AnvatoIE):
    """Direct cbslocal.com video pages; delegates to the Anvato backend.

    FIX: the named capture group in _VALID_URL had been garbled to
    `(?P\\d+)` (the `<id>` was lost in a bad text extraction), which makes
    re.compile raise; restored `(?P<id>\\d+)`.
    """
    _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/'
    _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
        'info_dict': {
            'id': '3580809',
            'ext': 'mp4',
            'title': 'A Very Blue Anniversary',
            'description': 'CBS2’s Cindy Hsu has more.',
            'thumbnail': 're:^https?://.*',
            'timestamp': int,
            'upload_date': r're:^\d{8}$',
            'uploader': 'CBS',
            'subtitles': {
                'en': 'mincount:5',
            },
            'categories': [
                'Stations\\Spoken Word\\WCBSTV',
                'Syndication\\AOL',
                'Syndication\\MSN',
                'Syndication\\NDN',
                'Syndication\\Yahoo',
                'Content\\News',
                'Content\\News\\Local News',
            ],
            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        mcp_id = self._match_id(url)
        # Hand off to Anvato with the CBS Local production access key.
        return self.url_result(
            'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id,
            'Anvato', mcp_id)


class CBSLocalArticleIE(AnvatoIE):
    """cbslocal.com article pages embedding Anvato or SendtoNews players.

    FIX: restored the lost `(?P<id>…)` named group here as well.
    """
    _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'

    _TESTS = [{
        # Anvato backend
        'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis',
        'md5': 'f0ee3081e3843f575fccef901199b212',
        'info_dict': {
            'id': '3401037',
            'ext': 'mp4',
            'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
            'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.',
            'thumbnail': 're:^https?://.*',
            'timestamp': 1463440500,
            'upload_date': '20160516',
            'uploader': 'CBS',
            'subtitles': {
                'en': 'mincount:5',
            },
            'categories': [
                'Stations\\Spoken Word\\KCBSTV',
                'Syndication\\MSN',
                'Syndication\\NDN',
                'Syndication\\AOL',
                'Syndication\\Yahoo',
                'Syndication\\Tribune',
                'Syndication\\Curb.tv',
                'Content\\News'
            ],
            'tags': ['CBS 2 News Evening'],
        },
    }, {
        # SendtoNews embed
        'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
        'info_dict': {
            'id': 'GxfCe0Zo7D-175909-5588',
        },
        'playlist_count': 9,
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Prefer a SendtoNews embed when present.
        sendtonews_url = SendtoNewsIE._extract_url(webpage)
        if sendtonews_url:
            return self.url_result(
                compat_urlparse.urljoin(url, sendtonews_url),
                ie=SendtoNewsIE.ie_key())

        info_dict = self._extract_anvato_videos(webpage, display_id)

        # Release date from the article byline, ISO meta tag as fallback.
        timestamp = unified_timestamp(self._html_search_regex(
            r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage,
            'released date', default=None)) or parse_iso8601(
            self._html_search_meta('uploadDate', webpage))

        info_dict.update({
            'display_id': display_id,
            'timestamp': timestamp,
        })

        return info_dict
'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oU
    def _real_extract(self, url):
        """Extract a CBS News embed player URL.

        The part of the URL matched as the id is a percent-encoded,
        base64-encoded, raw-deflate-compressed JSON blob; unwrap all
        three layers, pick the first video item and delegate to the
        shared CBS extraction helper via its mpx reference id.
        """
        # -zlib.MAX_WBITS makes zlib expect a raw deflate stream
        # (no zlib/gzip header), which is how the site packs the blob.
        item = self._parse_json(zlib.decompress(compat_b64decode(
            compat_urllib_parse_unquote(self._match_id(url))),
            -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0]
        return self._extract_video_info(item['mpxRefId'], 'cbsnews')
    def _real_extract(self, url):
        """Extract video(s) from a CBS News article or video page.

        Article pages may embed several players via
        <iframe data-src="...cbsnews.com/embed/video/...">; if any are
        found, return them as a playlist. Otherwise fall back to the
        CBSNEWS.defaultPayload JSON object embedded in the page.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        entries = []
        for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage):
            entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key()))
        if entries:
            # Multi-video article: hand each embed to CBSNewsEmbedIE.
            return self.playlist_result(
                entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage),
                playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage))

        item = self._parse_json(self._html_search_regex(
            r'CBSNEWS\.defaultPayload\s*=\s*({.+})',
            webpage, 'video JSON info'), display_id)['items'][0]
        return self._extract_video_info(item['mpxRefId'], 'cbsnews')
See http://www.cbsnews.com/live/ for the latest examples _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { 'id': 'clinton-sanders-prepare-to-face-off-in-nh', 'ext': 'mp4', 'title': 'Clinton, Sanders Prepare To Face Off In NH', 'duration': 334, }, 'skip': 'Video gone', } def _real_extract(self, url): display_id = self._match_id(url) video_info = self._download_json( 'http://feeds.cbsn.cbsnews.com/rundown/story', display_id, query={ 'device': 'desktop', 'dvr_slug': display_id, }) formats = self._extract_akamai_formats(video_info['url'], display_id) self._sort_formats(formats) return { 'id': display_id, 'display_id': display_id, 'title': video_info['headline'], 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), 'duration': parse_duration(video_info.get('segmentDur')), 'formats': formats, } ================================================ FILE: youtube_dl/extractor/cbssports.py ================================================ from __future__ import unicode_literals import re # from .cbs import CBSBaseIE from .common import InfoExtractor from ..utils import ( int_or_none, try_get, ) # class CBSSportsEmbedIE(CBSBaseIE): class CBSSportsEmbedIE(InfoExtractor): IE_NAME = 'cbssports:embed' _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? 
    def _real_extract(self, url):
        """Resolve a CBS Sports / 247Sports embed to an HLS video.

        The embed URL carries either a UUID (ids%3D...) or a numeric
        player content id (pcid%3D...); either one works as a query
        parameter against the content API.
        """
        uuid, pcid = re.match(self._VALID_URL, url).groups()
        query = {'id': uuid} if uuid else {'pcid': pcid}
        video = self._download_json(
            'https://www.cbssports.com/api/content/video/',
            uuid or pcid, query=query)[0]
        video_id = video['id']
        title = video['title']
        metadata = video.get('metaData') or {}
        formats = self._extract_m3u8_formats(
            metadata['files'][0]['url'], video_id, 'mp4',
            'm3u8_native', m3u8_id='hls', fatal=False)
        self._sort_formats(formats)

        image = video.get('image')
        thumbnails = None
        if image:
            image_path = image.get('path')
            if image_path:
                thumbnails = [{
                    'url': image_path,
                    'width': int_or_none(image.get('width')),
                    'height': int_or_none(image.get('height')),
                    'filesize': int_or_none(image.get('size')),
                }]

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnails': thumbnails,
            'description': video.get('description'),
            'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])),
            'duration': int_or_none(metadata.get('duration')),
        }
class CCCIE(InfoExtractor):
    """Extractor for single talks on media.ccc.de.

    The event id is scraped from the talk page, then the public events
    API supplies the recordings (one format per language/folder combo).
    """
    IE_NAME = 'media.ccc.de'
    _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/v/(?P<id>[^/?#&]+)'

    _TESTS = [{
        'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video',
        'md5': '3a1eda8f3a29515d27f5adb967d7e740',
        'info_dict': {
            'id': '1839',
            'ext': 'mp4',
            'title': 'Introduction to Processor Design',
            'creator': 'byterazor',
            'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac',
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20131228',
            'timestamp': 1388188800,
            'duration': 3710,
            'tags': list,
        }
    }, {
        'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        event_id = self._search_regex(r"data-id='(\d+)'", webpage, 'event id')
        event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id)

        formats = []
        for recording in event_data.get('recordings', []):
            recording_url = recording.get('recording_url')
            if not recording_url:
                continue
            language = recording.get('language')
            folder = recording.get('folder')
            format_id = None
            if language:
                format_id = language
            if folder:
                if language:
                    format_id += '-' + folder
                else:
                    format_id = folder
            # 'folder' may be absent (None); guard the substring test so a
            # missing folder does not raise TypeError.
            vcodec = 'h264' if 'h264' in (folder or '') else (
                'none' if folder in ('mp3', 'opus') else None
            )
            formats.append({
                'format_id': format_id,
                'url': recording_url,
                'width': int_or_none(recording.get('width')),
                'height': int_or_none(recording.get('height')),
                # API reports size in MiB; scale up to bytes.
                'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024),
                'language': language,
                'vcodec': vcodec,
            })
        self._sort_formats(formats)

        return {
            'id': event_id,
            'display_id': display_id,
            'title': event_data['title'],
            'creator': try_get(event_data, lambda x: ', '.join(x['persons'])),
            'description': event_data.get('description'),
            'thumbnail': event_data.get('thumb_url'),
            'timestamp': parse_iso8601(event_data.get('date')),
            'duration': int_or_none(event_data.get('length')),
            'tags': event_data.get('tags'),
            'formats': formats,
        }
    def _real_extract(self, url):
        """Extract a CCMA (TV3 / Catalunya Ràdio) video or audio item."""
        media_type, media_id = re.match(self._VALID_URL, url).groups()

        media = self._download_json(
            'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={
                'media': media_type,
                'idint': media_id,
            })

        formats = []
        media_url = media['media']['url']
        # The API returns either a list of labelled variants or a single
        # URL string.
        if isinstance(media_url, list):
            for format_ in media_url:
                format_url = url_or_none(format_.get('file'))
                if not format_url:
                    continue
                label = format_.get('label')
                f = parse_resolution(label)
                f.update({
                    'url': format_url,
                    'format_id': label,
                })
                formats.append(f)
        else:
            formats.append({
                'url': media_url,
                'vcodec': 'none' if media_type == 'audio' else None,
            })
        self._sort_formats(formats)

        informacio = media['informacio']
        title = informacio['titol']
        durada = informacio.get('durada') or {}
        duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text'))
        tematica = try_get(informacio, lambda x: x['tematica']['text'])

        timestamp = None
        data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
        try:
            # NOTE(review): '%Y-%d-%mT%H:%M:%S' has day and month swapped
            # relative to ISO 8601 — verify against the API's actual date
            # strings before changing. A ValueError from strptime is NOT
            # caught here and would propagate.
            timezone, data_utc = extract_timezone(data_utc)
            timestamp = calendar.timegm((datetime.datetime.strptime(
                data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
        except TypeError:
            # data_utc is None when the field is absent.
            pass

        subtitles = {}
        subtitols = media.get('subtitols') or []
        # A single subtitle entry comes as a dict rather than a list.
        if isinstance(subtitols, dict):
            subtitols = [subtitols]
        for st in subtitols:
            sub_url = st.get('url')
            if sub_url:
                subtitles.setdefault(
                    st.get('iso') or st.get('text') or 'ca', []).append({
                        'url': sub_url,
                    })

        thumbnails = []
        imatges = media.get('imatges', {})
        if imatges:
            thumbnail_url = imatges.get('url')
            if thumbnail_url:
                thumbnails = [{
                    'url': thumbnail_url,
                    'width': int_or_none(imatges.get('amplada')),
                    'height': int_or_none(imatges.get('alcada')),
                }]

        age_limit = None
        # "codi ètic" ratings look like 'XXX_TP' (all audiences) or
        # 'XXX_<age>'.
        codi_etic = try_get(informacio, lambda x: x['codi_etic']['id'])
        if codi_etic:
            codi_etic_s = codi_etic.split('_')
            if len(codi_etic_s) == 2:
                if codi_etic_s[1] == 'TP':
                    age_limit = 0
                else:
                    age_limit = int_or_none(codi_etic_s[1])

        return {
            'id': media_id,
            'title': title,
            'description': clean_html(informacio.get('descripcio')),
            'duration': duration,
            'timestamp': timestamp,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
            'formats': formats,
            'age_limit': age_limit,
            'alt_title': informacio.get('titol_complet'),
            'episode_number': int_or_none(informacio.get('capitol')),
            'categories': [tematica] if tematica else None,
            'series': informacio.get('programa'),
        }
'mp4', 'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)', 'description': '2月4日,蒙特泽莫罗透露了关于“车王”舒马赫恢复情况,但情况是否属实遭到了质疑。', 'duration': 37, 'uploader': 'shujun', 'timestamp': 1454677291, 'upload_date': '20160205', }, 'params': { 'skip_download': True, }, }, { # changePlayer('id') 'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml', 'info_dict': { 'id': '4bb9bb4db7a6471ba85fdeda5af0381e', 'ext': 'mp4', 'title': 'NHnews008 ANNUAL POLITICAL SEASON', 'description': 'Four Comprehensives', 'duration': 60, 'uploader': 'zhangyunlei', 'timestamp': 1425385521, 'upload_date': '20150303', }, 'params': { 'skip_download': True, }, }, { # loadvideo('id') 'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml', 'info_dict': { 'id': 'b15f009ff45c43968b9af583fc2e04b2', 'ext': 'mp4', 'title': 'Путь,усыпанный космеями Серия 1', 'description': 'Путь, усыпанный космеями', 'duration': 2645, 'uploader': 'renxue', 'timestamp': 1477479241, 'upload_date': '20161026', }, 'params': { 'skip_download': True, }, }, { # var initMyAray = 'id' 'url': 'http://www.ncpa-classic.com/2013/05/22/VIDE1369219508996867.shtml', 'info_dict': { 'id': 'a194cfa7f18c426b823d876668325946', 'ext': 'mp4', 'title': '小泽征尔音乐塾 音乐梦想无国界', 'duration': 2173, 'timestamp': 1369248264, 'upload_date': '20130522', }, 'params': { 'skip_download': True, }, }, { # var ids = ["id"] 'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml', 'info_dict': { 'id': 'a8606119a4884588a79d81c02abecc16', 'ext': 'mp3', 'title': '来自维也纳的新年贺礼', 'description': 'md5:f13764ae8dd484e84dd4b39d5bcba2a7', 'duration': 1578, 'uploader': 'djy', 'timestamp': 1482942419, 'upload_date': '20161228', }, 'params': { 'skip_download': True, }, 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml', 'only_matching': True, }, { 'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44', 'only_matching': True, }, { 'url': 
'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml', 'only_matching': True, }, { 'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml', 'only_matching': True, }, { 'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_id = self._search_regex( [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)', r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)', r'changePlayer\s*\(\s*["\']([\da-fA-F]+)', r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)', r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)', r'var\s+ids\s*=\s*\[["\']([\da-fA-F]+)'], webpage, 'video id') data = self._download_json( 'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id, query={ 'pid': video_id, 'url': url, 'idl': 32, 'idlr': 32, 'modifyed': 'false', }) title = data['title'] formats = [] video = data.get('video') if isinstance(video, dict): for quality, chapters_key in enumerate(('lowChapters', 'chapters')): video_url = try_get( video, lambda x: x[chapters_key][0]['url'], compat_str) if video_url: formats.append({ 'url': video_url, 'format_id': 'http', 'quality': quality, 'preference': -1, }) hls_url = try_get(data, lambda x: x['hls_url'], compat_str) if hls_url: hls_url = re.sub(r'maxbr=\d+&?', '', hls_url) formats.extend(self._extract_m3u8_formats( hls_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) uploader = data.get('editer_name') description = self._html_search_meta( 'description', webpage, default=None) timestamp = unified_timestamp(data.get('f_pgmtime')) duration = float_or_none(try_get(video, lambda x: x['totalLength'])) return { 'id': video_id, 'title': title, 'description': description, 'uploader': uploader, 'timestamp': timestamp, 'duration': duration, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/cda.py 
# coding: utf-8
from __future__ import unicode_literals

import codecs
import re

from .common import InfoExtractor
from ..compat import (
    compat_chr,
    compat_ord,
    compat_urllib_parse_unquote,
)
from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    merge_dicts,
    multipart_encode,
    parse_duration,
    random_birthday,
    urljoin,
)


class CDAIE(InfoExtractor):
    """Extractor for cda.pl videos (including age-gated ones).

    Handles the site's obfuscated file URLs (rot13 or a custom
    character-shift scheme) and fetches every quality variant by
    following the quality-switch links on the page.
    """
    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
    _BASE_URL = 'http://www.cda.pl/'
    _TESTS = [{
        'url': 'http://www.cda.pl/video/5749950c',
        'md5': '6f844bf51b15f31fae165365707ae970',
        'info_dict': {
            'id': '5749950c',
            'ext': 'mp4',
            'height': 720,
            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
            'description': 'md5:269ccd135d550da90d1662651fcb9772',
            'thumbnail': r're:^https?://.*\.jpg$',
            'average_rating': float,
            'duration': 39,
            'age_limit': 0,
        }
    }, {
        'url': 'http://www.cda.pl/video/57413289',
        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
        'info_dict': {
            'id': '57413289',
            'ext': 'mp4',
            'title': 'Lądowanie na lotnisku na Maderze',
            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'crash404',
            'view_count': int,
            'average_rating': float,
            'duration': 137,
            'age_limit': 0,
        }
    }, {
        # Age-restricted
        'url': 'http://www.cda.pl/video/1273454c4',
        'info_dict': {
            'id': '1273454c4',
            'ext': 'mp4',
            'title': 'Bronson (2008) napisy HD 1080p',
            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
            'height': 1080,
            'uploader': 'boniek61',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 5554,
            'age_limit': 18,
            'view_count': int,
            'average_rating': float,
        },
    }, {
        'url': 'http://ebd.cda.pl/0x0/5749950c',
        'only_matching': True,
    }]

    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
        """POST a random birth date to the age gate and return the
        unlocked page HTML."""
        form_data = random_birthday('rok', 'miesiac', 'dzien')
        form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
        data, content_type = multipart_encode(form_data)
        return self._download_webpage(
            urljoin(url, '/a/validatebirth'), video_id, *args,
            data=data, headers={
                'Referer': url,
                'Content-Type': content_type,
            }, **kwargs)

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # Request the HTML5 player variant via cookie.
        self._set_cookie('cda.pl', 'cda.player', 'html5')
        webpage = self._download_webpage(
            self._BASE_URL + '/video/' + video_id, video_id)

        if 'Ten film jest dostępny dla użytkowników premium' in webpage:
            raise ExtractorError('This video is only available for premium users.', expected=True)

        if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
            self.raise_geo_restricted()

        need_confirm_age = False
        if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
                                   webpage, 'birthday validate form', default=None):
            webpage = self._download_age_confirm_page(
                url, video_id, note='Confirming age')
            need_confirm_age = True

        formats = []

        uploader = self._search_regex(r'''(?x)
            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
        ''', webpage, 'uploader', default=None, group='uploader')
        view_count = self._search_regex(
            r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
            'view_count', default=None)
        average_rating = self._search_regex(
            (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
             r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
            group='rating_value')

        info_dict = {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'uploader': uploader,
            'view_count': int_or_none(view_count),
            'average_rating': float_or_none(average_rating),
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': formats,
            'duration': None,
            'age_limit': 18 if need_confirm_age else 0,
        }

        info = self._search_json_ld(webpage, video_id, default={})

        # Source: https://www.cda.pl/js/player.js?t=1606154898
        def decrypt_file(a):
            """Reverse the player's URL obfuscation: strip decoy markers,
            URL-decode, shift printable characters, and normalise host
            and extension."""
            for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
                a = a.replace(p, '')
            a = compat_urllib_parse_unquote(a)
            b = []
            for c in a:
                f = compat_ord(c)
                # Shift only printable ASCII (33..126); leave the rest as-is.
                b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f))
            a = ''.join(b)
            a = a.replace('.cda.mp4', '')
            for p in ('.2cda.pl', '.3cda.pl'):
                a = a.replace(p, '.cda.pl')
            if '/upstream' in a:
                a = a.replace('/upstream', '.mp4/upstream')
                return 'https://' + a
            return 'https://' + a + '.mp4'

        def extract_format(page, version):
            """Pull the player_data JSON out of *page* and append the
            decoded format (plus its quality/height, when shown) to
            info_dict['formats']."""
            json_str = self._html_search_regex(
                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
                '%s player_json' % version, fatal=False, group='player_data')
            if not json_str:
                return
            player_data = self._parse_json(
                json_str, '%s player_data' % version, fatal=False)
            if not player_data:
                return
            video = player_data.get('video')
            if not video or 'file' not in video:
                self.report_warning('Unable to extract %s version information' % version)
                return
            if video['file'].startswith('uggc'):
                # 'uggc' is rot13('http') — the URL is just rot13-encoded.
                video['file'] = codecs.decode(video['file'], 'rot_13')
                if video['file'].endswith('adc.mp4'):
                    video['file'] = video['file'].replace('adc.mp4', '.mp4')
            elif not video['file'].startswith('http'):
                video['file'] = decrypt_file(video['file'])
            f = {
                'url': video['file'],
            }
            m = re.search(
                r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p',
                page)
            if m:
                f.update({
                    'format_id': m.group('format_id'),
                    'height': int(m.group('height')),
                })
            info_dict['formats'].append(f)
            if not info_dict['duration']:
                info_dict['duration'] = parse_duration(video.get('duration'))

        # The initially-loaded page carries the default quality ...
        extract_format(webpage, 'default')

        # ... and each inactive quality button links to a page with that
        # variant's player_data.
        for href, resolution in re.findall(
                r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                webpage):
            if need_confirm_age:
                handler = self._download_age_confirm_page
            else:
                handler = self._download_webpage

            webpage = handler(
                urljoin(self._BASE_URL, href), video_id,
                'Downloading %s version information' % resolution, fatal=False)
            if not webpage:
                # Manually report warning because empty page is returned when
                # invalid version is requested.
                self.report_warning('Unable to download %s version information' % resolution)
                continue

            extract_format(webpage, resolution)

        self._sort_formats(formats)

        return merge_dicts(info_dict, info)
    def _search_nextjs_data(self, webpage, video_id, **kw):
        """Parse the Next.js __NEXT_DATA__ JSON blob embedded in the page."""
        return self._parse_json(
            self._search_regex(
                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
                webpage, 'next.js data', **kw),
            video_id, **kw)

    def _real_extract(self, url):
        """Extract a Česká televize programme, live channel or bonus clip.

        Modern /porady/ and /zive/ pages are resolved to an IDEC (or
        bonus) id via Next.js data and replayed through the legacy
        iFramePlayer; the player page then points at the client playlist
        API, which is queried twice (second time with a Safari UA) to
        also pick up formats served only to that UA.
        """
        playlist_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, playlist_id)
        parsed_url = compat_urllib_parse_urlparse(urlh.geturl())
        site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize')
        playlist_title = self._og_search_title(webpage, default=None)
        if site_name and playlist_title:
            # Strip the trailing "— Site Name" from the og:title.
            playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0]
        playlist_description = self._og_search_description(webpage, default=None)
        if playlist_description:
            playlist_description = playlist_description.replace('\xa0', ' ')

        type_ = 'IDEC'
        if re.search(r'(^/porady|/zive)/', parsed_url.path):
            next_data = self._search_nextjs_data(webpage, playlist_id)
            if '/zive/' in parsed_url.path:
                idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False)
            else:
                idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False)
                if not idec:
                    # Bonus clips use a different id and playlist type.
                    idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'videobonusDetail', 'bonusId'), get_all=False)
                    if idec:
                        type_ = 'bonus'
            if not idec:
                raise ExtractorError('Failed to find IDEC id')
            iframe_hash = self._download_webpage(
                'https://www.ceskatelevize.cz/v-api/iframe-hash/',
                playlist_id, note='Getting IFRAME hash')
            query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, }
            webpage = self._download_webpage(
                'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php',
                playlist_id, note='Downloading player', query=query)

        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
        if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
            self.raise_geo_restricted(NOT_AVAILABLE_STRING)
        if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )):
            raise ExtractorError('no video with IDEC available', video_id=idec, expected=True)

        type_ = None
        episode_id = None

        playlist = self._parse_json(
            self._search_regex(
                r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
                default='{}'), playlist_id)
        if playlist:
            type_ = playlist.get('type')
            episode_id = playlist.get('id')

        if not type_:
            type_ = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
                webpage, 'type')
        if not episode_id:
            episode_id = self._html_search_regex(
                r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
                webpage, 'episode_id')

        data = {
            'playlist[0][type]': type_,
            'playlist[0][id]': episode_id,
            'requestUrl': parsed_url.path,
            'requestSource': 'iVysilani',
        }

        entries = []

        # First pass with the default UA, second with Safari: some
        # formats are only served to Safari, and those get merged into
        # the entries created by the first pass.
        for user_agent in (None, USER_AGENTS['Safari']):
            req = sanitized_Request(
                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
                data=urlencode_postdata(data))

            req.add_header('Content-type', 'application/x-www-form-urlencoded')
            req.add_header('x-addr', '127.0.0.1')
            req.add_header('X-Requested-With', 'XMLHttpRequest')
            if user_agent:
                req.add_header('User-Agent', user_agent)
            req.add_header('Referer', url)

            playlistpage = self._download_json(req, playlist_id, fatal=False)

            if not playlistpage:
                continue

            playlist_url = playlistpage['url']
            if playlist_url == 'error_region':
                raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)

            req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
            req.add_header('Referer', url)

            playlist = self._download_json(req, playlist_id, fatal=False)
            if not playlist:
                continue

            playlist = playlist.get('playlist')
            if not isinstance(playlist, list):
                continue

            playlist_len = len(playlist)

            for num, item in enumerate(playlist):
                is_live = item.get('type') == 'LIVE'
                formats = []
                for format_id, stream_url in item.get('streamUrls', {}).items():
                    if 'drmOnly=true' in stream_url:
                        continue
                    if 'playerType=flash' in stream_url:
                        stream_formats = self._extract_m3u8_formats(
                            stream_url, playlist_id, 'mp4', 'm3u8_native',
                            m3u8_id='hls-%s' % format_id, fatal=False)
                    else:
                        stream_formats = self._extract_mpd_formats(
                            stream_url, playlist_id,
                            mpd_id='dash-%s' % format_id, fatal=False)
                    # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031
                    if format_id == 'audioDescription':
                        for f in stream_formats:
                            f['source_preference'] = -10
                    formats.extend(stream_formats)

                if user_agent and len(entries) == playlist_len:
                    # Second (Safari) pass: merge into the existing entry.
                    entries[num]['formats'].extend(formats)
                    continue

                item_id = str_or_none(item.get('id') or item['assetId'])
                title = item['title']

                duration = float_or_none(item.get('duration'))
                thumbnail = item.get('previewImageUrl')

                subtitles = {}
                if item.get('type') == 'VOD':
                    subs = item.get('subtitles')
                    if subs:
                        subtitles = self.extract_subtitles(episode_id, subs)

                if playlist_len == 1:
                    final_title = playlist_title or title
                else:
                    final_title = '%s (%s)' % (playlist_title, title)

                entries.append({
                    'id': item_id,
                    'title': final_title,
                    'description': playlist_description if playlist_len == 1 else None,
                    'thumbnail': thumbnail,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                    'is_live': is_live,
                })

        for e in entries:
            self._sort_formats(e['formats'])

        if len(entries) == 1:
            return entries[0]
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    def _get_subtitles(self, episode_id, subs):
        """Download the first subtitle track and convert it to SRT (Czech)."""
        original_subtitles = self._download_webpage(
            subs[0]['url'], episode_id, 'Downloading subtitles')
        srt_subs = self._fix_subtitles(original_subtitles)
        return {
            'cs': [{
                'ext': 'srt',
                'data': srt_subs,
            }]
        }

    @staticmethod
    def _fix_subtitles(subtitles):
        """ Convert millisecond-based subtitles to SRT """

        def _msectotimecode(msec):
            """ Helper utility to convert milliseconds to timecode """
            components = []
            # Successively split into ms, seconds, minutes, hours.
            for divider in [1000, 60, 60, 100]:
                components.append(msec % divider)
                msec //= divider
            return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)

        def _fix_subtitle(subtitle):
            # Cue lines look like '<index>; <start_ms> <stop_ms>';
            # everything else is passed through unchanged.
            for line in subtitle.splitlines():
                m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
                if m:
                    yield m.group(1)
                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
                    yield '{0} --> {1}'.format(start, stop)
                else:
                    yield line

        return '\r\n'.join(_fix_subtitle(subtitles))
'33ad69d2-6a4e-4172-83a1-a523013dec76', 'ext': 'mp4', 'title': 'Ranges for the Standard Library', 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', 'duration': 5646, 'thumbnail': r're:https?://.*\.jpg', 'upload_date': '20150930', 'timestamp': 1443640735, }, 'params': { 'skip_download': True, }, }, { 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', 'info_dict': { 'id': 'Events/DEVintersection/DEVintersection-2016', 'title': 'DEVintersection 2016 Orlando Sessions', }, 'playlist_mincount': 14, }, { 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', 'only_matching': True, }, { 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', 'only_matching': True, }] _RSS_URL = 'http://channel9.msdn.com/%s/RSS' @staticmethod def _extract_urls(webpage): return re.findall( r']+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', webpage) def _extract_list(self, video_id, rss_url=None): if not rss_url: rss_url = self._RSS_URL % video_id rss = self._download_xml(rss_url, video_id, 'Downloading RSS') entries = [self.url_result(session_url.text, 'Channel9') for session_url in rss.findall('./channel/item/link')] title_text = rss.find('./channel/title').text return self.playlist_result(entries, video_id, title_text) def _real_extract(self, url): content_path, rss = re.match(self._VALID_URL, url).groups() if rss: return self._extract_list(content_path, url) webpage = self._download_webpage( url, content_path, 'Downloading web page') episode_data = self._search_regex( r"data-episode='([^']+)'", webpage, 'episode data', default=None) if episode_data: episode_data = self._parse_json(unescapeHTML( episode_data), content_path) content_id = episode_data['contentId'] is_session = '/Sessions(' in episode_data['api'] content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + 
'?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' if is_session: content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' else: content_url += 'Authors,Body&$expand=Authors' content_data = self._download_json(content_url, content_id) title = content_data['Title'] QUALITIES = ( 'mp3', 'wmv', 'mp4', 'wmv-low', 'mp4-low', 'wmv-mid', 'mp4-mid', 'wmv-high', 'mp4-high', ) quality_key = qualities(QUALITIES) def quality(quality_id, format_url): return (len(QUALITIES) if '_Source.' in format_url else quality_key(quality_id)) formats = [] urls = set() SITE_QUALITIES = { 'MP3': 'mp3', 'MP4': 'mp4', 'Low Quality WMV': 'wmv-low', 'Low Quality MP4': 'mp4-low', 'Mid Quality WMV': 'wmv-mid', 'Mid Quality MP4': 'mp4-mid', 'High Quality WMV': 'wmv-high', 'High Quality MP4': 'mp4-high', } formats_select = self._search_regex( r'(?s)]+name=["\']format[^>]+>(.+?)]+\bvalue=(["\'])(?P(?:(?!\1).)+)\1[^>]*>\s*(?P[^<]+?)\s*<', formats_select): format_url = mobj.group('url') if format_url in urls: continue urls.add(format_url) format_id = mobj.group('format') quality_id = SITE_QUALITIES.get(format_id, format_id) formats.append({ 'url': format_url, 'format_id': quality_id, 'quality': quality(quality_id, format_url), 'vcodec': 'none' if quality_id == 'mp3' else None, }) API_QUALITIES = { 'VideoMP4Low': 'mp4-low', 'VideoWMV': 'wmv-mid', 'VideoMP4Medium': 'mp4-mid', 'VideoMP4High': 'mp4-high', 'VideoWMVHQ': 'wmv-hq', } for format_id, q in API_QUALITIES.items(): q_url = content_data.get(format_id) if not q_url or q_url in urls: continue urls.add(q_url) formats.append({ 'url': q_url, 'format_id': q, 'quality': quality(q, q_url), }) self._sort_formats(formats) slides = content_data.get('Slides') zip_file = content_data.get('ZipFile') if not formats and not slides and not zip_file: raise ExtractorError( 'None of recording, slides or zip are 
available for %s' % content_path) subtitles = {} for caption in content_data.get('Captions', []): caption_url = caption.get('Url') if not caption_url: continue subtitles.setdefault(caption.get('Language', 'en'), []).append({ 'url': caption_url, 'ext': 'vtt', }) common = { 'id': content_id, 'title': title, 'description': clean_html(content_data.get('Description') or content_data.get('Body')), 'thumbnail': content_data.get('VideoPlayerPreviewImage'), 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), 'timestamp': parse_iso8601(content_data.get('PublishedDate')), 'avg_rating': int_or_none(content_data.get('Rating')), 'rating_count': int_or_none(content_data.get('RatingCount')), 'view_count': int_or_none(content_data.get('Views')), 'comment_count': int_or_none(content_data.get('CommentCount')), 'subtitles': subtitles, } if is_session: speakers = [] for s in content_data.get('Speakers', []): speaker_name = s.get('FullName') if not speaker_name: continue speakers.append(speaker_name) common.update({ 'session_code': content_data.get('Code'), 'session_room': content_data.get('Room'), 'session_speakers': speakers, }) else: authors = [] for a in content_data.get('Authors', []): author_name = a.get('DisplayName') if not author_name: continue authors.append(author_name) common['authors'] = authors contents = [] if slides: d = common.copy() d.update({'title': title + '-Slides', 'url': slides}) contents.append(d) if zip_file: d = common.copy() d.update({'title': title + '-Zip', 'url': zip_file}) contents.append(d) if formats: d = common.copy() d.update({'title': title, 'formats': formats}) contents.append(d) return self.playlist_result(contents) else: return self._extract_list(content_path) ================================================ FILE: youtube_dl/extractor/charlierose.py ================================================ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import remove_end class 
CharlieRoseIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P\d+)' _TESTS = [{ 'url': 'https://charlierose.com/videos/27996', 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', 'info_dict': { 'id': '27996', 'ext': 'mp4', 'title': 'Remembering Zaha Hadid', 'thumbnail': r're:^https?://.*\.jpg\?\d+', 'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.', 'subtitles': { 'en': [{ 'ext': 'vtt', }], }, }, }, { 'url': 'https://charlierose.com/videos/27996', 'only_matching': True, }, { 'url': 'https://charlierose.com/episodes/30887?autoplay=true', 'only_matching': True, }] _PLAYER_BASE = 'https://charlierose.com/video/player/%s' def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(self._PLAYER_BASE % video_id, video_id) title = remove_end(self._og_search_title(webpage), ' - Charlie Rose') info_dict = self._parse_html5_media_entries( self._PLAYER_BASE % video_id, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] self._sort_formats(info_dict['formats']) self._remove_duplicate_formats(info_dict['formats']) info_dict.update({ 'id': video_id, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), }) return info_dict ================================================ FILE: youtube_dl/extractor/chaturbate.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, lowercase_escape, url_or_none, ) class ChaturbateIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P[^/?&#]+)' _TESTS = [{ 'url': 'https://www.chaturbate.com/siswet19/', 'info_dict': { 'id': 'siswet19', 'ext': 'mp4', 'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'age_limit': 18, 'is_live': True, }, 'params': { 'skip_download': 
True, }, 'skip': 'Room is offline', }, { 'url': 'https://chaturbate.com/fullvideo/?b=caylin', 'only_matching': True, }, { 'url': 'https://en.chaturbate.com/siswet19/', 'only_matching': True, }] _ROOM_OFFLINE = 'Room is currently offline' def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'https://chaturbate.com/%s/' % video_id, video_id, headers=self.geo_verification_headers()) found_m3u8_urls = [] data = self._parse_json( self._search_regex( r'initialRoomDossier\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'data', default='{}', group='value'), video_id, transform_source=lowercase_escape, fatal=False) if data: m3u8_url = url_or_none(data.get('hls_source')) if m3u8_url: found_m3u8_urls.append(m3u8_url) if not found_m3u8_urls: for m in re.finditer( r'(\\u002[27])(?Phttp.+?\.m3u8.*?)\1', webpage): found_m3u8_urls.append(lowercase_escape(m.group('url'))) if not found_m3u8_urls: for m in re.finditer( r'(["\'])(?Phttp.+?\.m3u8.*?)\1', webpage): found_m3u8_urls.append(m.group('url')) m3u8_urls = [] for found_m3u8_url in found_m3u8_urls: m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '') for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url): if m3u8_url not in m3u8_urls: m3u8_urls.append(m3u8_url) if not m3u8_urls: error = self._search_regex( [r']+class=(["\'])desc_span\1[^>]*>(?P[^<]+)', r']+id=(["\'])defchat\1[^>]*>\s*

(?P[^<]+)<'], webpage, 'error', group='error', default=None) if not error: if any(p in webpage for p in ( self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): error = self._ROOM_OFFLINE if error: raise ExtractorError(error, expected=True) raise ExtractorError('Unable to find stream URL') formats = [] for m3u8_url in m3u8_urls: for known_id in ('fast', 'slow'): if '_%s' % known_id in m3u8_url: m3u8_id = known_id break else: m3u8_id = None formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, ext='mp4', # ffmpeg skips segments for fast m3u8 preference=-10 if m3u8_id == 'fast' else None, m3u8_id=m3u8_id, fatal=False, live=True)) self._sort_formats(formats) return { 'id': video_id, 'title': self._live_title(video_id), 'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, 'age_limit': self._rta_search(webpage), 'is_live': True, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/chilloutzone.py ================================================ from __future__ import unicode_literals import re import json from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import compat_b64decode from ..utils import ( clean_html, ExtractorError ) class ChilloutzoneIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P[\w|-]+)\.html' _TESTS = [{ 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', 'md5': 'a76f3457e813ea0037e5244f509e66d1', 'info_dict': { 'id': 'enemene-meck-alle-katzen-weg', 'ext': 'mp4', 'title': 'Enemene Meck - Alle Katzen weg', 'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?', }, }, { 'note': 'Video hosted at YouTube', 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html', 'info_dict': { 'id': '1YVQaAgHyRU', 'ext': 'mp4', 'title': '16 Photos Taken 1 Second Before Disaster', 'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814', 'uploader': 'BuzzFeedVideo', 'uploader_id': 
'BuzzFeedVideo', 'upload_date': '20131105', }, }, { 'note': 'Video hosted at Vimeo', 'url': 'http://www.chilloutzone.net/video/icon-blending.html', 'md5': '2645c678b8dc4fefcc0e1b60db18dac1', 'info_dict': { 'id': '85523671', 'ext': 'mp4', 'title': 'The Sunday Times - Icons', 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}', 'uploader': 'Us', 'uploader_id': 'usfilms', 'upload_date': '20140131' }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict video_url = video_info_dict['mediaUrl'] description = clean_html(video_info_dict.get('description')) title = video_info_dict['title'] native_platform = video_info_dict['nativePlatform'] native_video_id = video_info_dict['nativeVideoId'] source_priority = video_info_dict['sourcePriority'] # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) if native_platform is None: youtube_url = YoutubeIE._extract_url(webpage) if youtube_url: return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) # Non Fallback: Decide to use native source (e.g. 
youtube or vimeo) or # the own CDN if source_priority == 'native': if native_platform == 'youtube': return self.url_result(native_video_id, ie='Youtube') if native_platform == 'vimeo': return self.url_result( 'http://vimeo.com/' + native_video_id, ie='Vimeo') if not video_url: raise ExtractorError('No video found') return { 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': title, 'description': description, } ================================================ FILE: youtube_dl/extractor/chirbit.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import parse_duration class ChirbitIE(InfoExtractor): IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://chirb.it/be2abG', 'info_dict': { 'id': 'be2abG', 'ext': 'mp3', 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', 'duration': 306, 'uploader': 'Gerryaudio', }, 'params': { 'skip_download': True, } }, { 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', 'only_matching': True, }, { 'url': 'https://chirb.it/wp/MN58c2', 'only_matching': True, }] def _real_extract(self, url): audio_id = self._match_id(url) webpage = self._download_webpage( 'http://chirb.it/%s' % audio_id, audio_id) data_fd = self._search_regex( r'data-fd=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'data fd', group='url') # Reverse engineered from https://chirb.it/js/chirbit.player.js (look # for soundURL) audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8') title = self._search_regex( r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') description = self._search_regex( r'

Description

\s*]*>([^<]+)', webpage, 'description', default=None) duration = parse_duration(self._search_regex( r'class=["\']c-length["\'][^>]*>([^<]+)', webpage, 'duration', fatal=False)) uploader = self._search_regex( r'id=["\']chirbit-username["\'][^>]*>([^<]+)', webpage, 'uploader', fatal=False) return { 'id': audio_id, 'url': audio_url, 'title': title, 'description': description, 'duration': duration, 'uploader': uploader, } class ChirbitProfileIE(InfoExtractor): IE_NAME = 'chirbit:profile' _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', 'info_dict': { 'id': 'ScarletBeauty', }, 'playlist_mincount': 3, } def _real_extract(self, url): profile_id = self._match_id(url) webpage = self._download_webpage(url, profile_id) entries = [ self.url_result(self._proto_relative_url('//chirb.it/' + video_id)) for _, video_id in re.findall(r']+id=([\'"])copy-btn-(?P[0-9a-zA-Z]+)\1', webpage)] return self.playlist_result(entries, profile_id) ================================================ FILE: youtube_dl/extractor/cinchcast.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( unified_strdate, xpath_text, ) class CinchcastIE(InfoExtractor): _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P[0-9]+)' _TESTS = [{ 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', 'info_dict': { 'id': '5258197', 'ext': 'mp3', 'title': 'Train Your Brain to Up Your Game with Coach Mandy', 'upload_date': '20130816', }, }, { # Actual test is run in generic, look for undergroundwellness 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) doc = self._download_xml( 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id, video_id) 
# (continuation of CinchcastIE._real_extract from the previous packed line)
        item = doc.find('.//item')
        title = xpath_text(item, './title', fatal=True)
        # Upload date lives in a longtailvideo-namespaced RSS extension element.
        date_str = xpath_text(
            item, './{http://developer.longtailvideo.com/trac/}date')
        upload_date = unified_strdate(date_str, day_first=False)
        # duration is present but wrong
        formats = [{
            'format_id': 'main',
            'url': item.find('./{http://search.yahoo.com/mrss/}content').attrib['url'],
        }]
        backup_url = xpath_text(
            item, './{http://developer.longtailvideo.com/trac/}backupContent')
        if backup_url:
            formats.append({
                'preference': 2,  # seems to be more reliable
                'format_id': 'backup',
                'url': backup_url,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'formats': formats,
        }


================================================
FILE: youtube_dl/extractor/cinemax.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import re

from .hbo import HBOBaseIE


class CinemaxIE(HBOBaseIE):
    """Extractor for cinemax.com episode pages.

    Thin wrapper: maps the page path to the corresponding .xml content
    descriptor and delegates to HBOBaseIE._extract_info (defined in .hbo).
    """
    # NOTE(review): the named groups look garbled by extraction; upstream
    # presumably uses (?P<path>...) and an inner id group — confirm before use.
    _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P[^/]+/video/[0-9a-z-]+-(?P\d+))'
    _TESTS = [{
        'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903',
        'md5': '82e0734bba8aa7ef526c9dd00cf35a05',
        'info_dict': {
            'id': '20126903',
            'ext': 'mp4',
            'title': 'S1 Ep 1: Recap',
        },
        'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'],
    }, {
        'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903.embed',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The base extractor does all the work; we only override the id so it
        # matches the numeric id from the URL rather than the XML's own id.
        path, video_id = re.match(self._VALID_URL, url).groups()
        info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id)
        info['id'] = video_id
        return info


================================================
FILE: youtube_dl/extractor/ciscolive.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import itertools

from .common import InfoExtractor
from ..compat import (
    compat_parse_qs,
    compat_urllib_parse_urlparse,
)
from ..utils import (
    clean_html,
    float_or_none,
    int_or_none,
# (continuation of the ciscolive.py import block from the previous packed line)
    try_get,
    urlencode_postdata,
)


class CiscoLiveBaseIE(InfoExtractor):
    """Shared plumbing for Cisco Live extractors.

    Talks to the Rainfocus events API and converts its session records into
    url_transparent results pointing at Brightcove.
    """
    # These appear to be constant across all Cisco Live presentations
    # and are not tied to any user session or event
    RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s'
    RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz'
    RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye'
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s'

    HEADERS = {
        'Origin': 'https://ciscolive.cisco.com',
        'rfApiProfileId': RAINFOCUS_API_PROFILE_ID,
        'rfWidgetId': RAINFOCUS_WIDGET_ID,
    }

    def _call_api(self, ep, rf_id, query, referrer, note=None):
        """POST `query` to the Rainfocus endpoint `ep` and return parsed JSON.

        `referrer` is sent as the Referer header; the API keys in HEADERS are
        always included.
        """
        headers = self.HEADERS.copy()
        headers['Referer'] = referrer
        return self._download_json(
            self.RAINFOCUS_API_URL % ep, rf_id, note=note,
            data=urlencode_postdata(query), headers=headers)

    def _parse_rf_item(self, rf_item):
        """Map one Rainfocus item dict to a url_transparent Brightcove result.

        Requires 'title' and videos[0]['url'] (the Brightcove video id);
        everything else is optional and extracted defensively via try_get.
        """
        event_name = rf_item.get('eventName')
        title = rf_item['title']
        description = clean_html(rf_item.get('abstract'))
        presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName'])
        bc_id = rf_item['videos'][0]['url']
        bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id
        duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length']))
        location = try_get(rf_item, lambda x: x['times'][0]['room'])
        if duration:
            # API reports minutes; downstream expects seconds.
            duration = duration * 60
        return {
            '_type': 'url_transparent',
            'url': bc_url,
            'ie_key': 'BrightcoveNew',
            'title': title,
            'description': description,
            'duration': duration,
            'creator': presenter_name,
            'location': location,
            'series': event_name,
        }


class CiscoLiveSessionIE(CiscoLiveBaseIE):
    """Extractor for a single Cisco Live on-demand session (#/session/<id>)."""
    # NOTE(review): named group garbled by extraction; upstream presumably
    # uses (?P<id>[^/?&]+) — confirm before use.
    _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P[^/?&]+)'
    _TESTS = [{
        'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs',
        'md5': 'c98acf395ed9c9f766941c70f5352e22',
        'info_dict': {
            'id': '5803694304001',
            'ext': 'mp4',
            'title': '13 Smart Automations to Monitor Your Cisco IOS Network',
            'description':
# (continuation of CiscoLiveSessionIE._TESTS from the previous packed line)
                'md5:ec4a436019e09a918dec17714803f7cc',
            'timestamp': 1530305395,
            'upload_date': '20180629',
            'uploader_id': '5647924234001',
            'location': '16B Mezz.',
        },
    }, {
        'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU',
        'only_matching': True,
    }, {
        'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Look the session up by its Rainfocus id and hand the first item to
        # the shared Brightcove mapper.
        rf_id = self._match_id(url)
        rf_result = self._call_api('session', rf_id, {'id': rf_id}, url)
        return self._parse_rf_item(rf_result['items'][0])


class CiscoLiveSearchIE(CiscoLiveBaseIE):
    """Extractor for Cisco Live on-demand-library search result pages.

    Yields a playlist of every searchable session that actually has a
    Brightcove video attached.
    """
    _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)'
    _TESTS = [{
        'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/',
        'info_dict': {
            'title': 'Search query',
        },
        'playlist_count': 5,
    }, {
        'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/',
        'only_matching': True,
    }, {
        'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Session URLs also match the library pattern; defer to the more
        # specific extractor.
        return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url)

    @staticmethod
    def _check_bc_id_exists(rf_item):
        # True when the item carries a numeric Brightcove id in videos[0].url.
        return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None

    def _entries(self, query, url):
        """Generator over all result pages of a Rainfocus search query."""
        query['size'] = 50
        query['from'] = 0
        for page_num in itertools.count(1):
            results = self._call_api(
                'search', None, query, url,
                'Downloading search JSON page %d' % page_num)
# (continuation of CiscoLiveSearchIE._entries from the previous packed line)
            # Some responses nest the page under sectionList[0].
            sl = try_get(results, lambda x: x['sectionList'][0], dict)
            if sl:
                results = sl
            items = results.get('items')
            if not items or not isinstance(items, list):
                break
            for item in items:
                if not isinstance(item, dict):
                    continue
                if not self._check_bc_id_exists(item):
                    continue
                yield self._parse_rf_item(item)
            # Honour the server's actual page size, then stop once the window
            # would run past the reported total.
            size = int_or_none(results.get('size'))
            if size is not None:
                query['size'] = size
            total = int_or_none(results.get('total'))
            if total is not None and query['from'] + query['size'] > total:
                break
            query['from'] += query['size']

    def _real_extract(self, url):
        # Re-use the page's own query string as the search payload.
        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        query['type'] = 'session'
        return self.playlist_result(
            self._entries(query, url), playlist_title='Search query')


================================================
FILE: youtube_dl/extractor/cjsw.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    unescapeHTML,
)


class CJSWIE(InfoExtractor):
    """Extractor for cjsw.com radio programme episode pages."""
    # NOTE(review): named groups garbled by extraction; upstream presumably
    # uses (?P<program>[^/]+) and (?P<id>\d+) — confirm before use.
    _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P[^/]+)/episode/(?P\d+)'
    _TESTS = [{
        'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620',
        'md5': 'cee14d40f1e9433632c56e3d14977120',
        'info_dict': {
            'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41',
            'ext': 'mp3',
            'title': 'Freshly Squeezed – Episode June 20, 2017',
            'description': 'md5:c967d63366c3898a80d0c7b0ff337202',
            'series': 'Freshly Squeezed',
            'episode_id': '20170620',
        },
    }, {
        # no description
        'url': 'http://cjsw.com/program/road-pops/episode/20170707/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        program, episode_id = mobj.group('program', 'id')
        # Fallback id; replaced below by the UUID from the audio URL if found.
        audio_id = '%s/%s' % (program, episode_id)

        webpage = self._download_webpage(url, episode_id)

        # NOTE(review): the first alternative looks truncated by extraction
        # (leading tag pattern and group name missing) — confirm upstream.
        title = unescapeHTML(self._search_regex(
            (r']+class=["\']episode-header__title["\'][^>]*>(?P[^<]+)',
             r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'),
            webpage, 'title',
# (continuation of CJSWIE._real_extract from the previous packed line)
            group='title'))

        audio_url = self._search_regex(
            r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1',
            webpage, 'audio url', group='url')

        # Prefer the UUID embedded in the mp3 URL as the canonical id.
        audio_id = self._search_regex(
            r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3',
            audio_url, 'audio id', default=audio_id)

        formats = [{
            'url': audio_url,
            'ext': determine_ext(audio_url, 'mp3'),
            'vcodec': 'none',
        }]

        description = self._html_search_regex(
            r'<p>(?P<description>.+?)</p>', webpage, 'description',
            default=None)
        series = self._search_regex(
            r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage,
            'series', default=program, group='name')

        return {
            'id': audio_id,
            'title': title,
            'description': description,
            'formats': formats,
            'series': series,
            'episode_id': episode_id,
        }


================================================
FILE: youtube_dl/extractor/clipchamp.py
================================================
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    merge_dicts,
    T,
    traverse_obj,
    unified_timestamp,
    url_or_none,
)


class ClipchampIE(InfoExtractor):
    """Extractor for clipchamp.com watch pages (Cloudflare Stream backend)."""
    _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU',
        'info_dict': {
            'id': 'gRXZ4ZhdDaU',
            'ext': 'mp4',
            'title': 'Untitled video',
            'uploader': 'Alexander Schwartz',
            'timestamp': 1680805580,
            'upload_date': '20230406',
            'thumbnail': r're:^https?://.+\.jpg',
        },
        'params': {
            'skip_download': 'm3u8',
            'format': 'bestvideo',
        },
    }]

    # %s slots: customer subdomain, video path, manifest extension (mpd/m3u8).
    _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
    _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Video metadata is embedded in the Next.js page props.
        data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']

        storage_location = data.get('storage_location')
        if storage_location != 'cf_stream':
            raise ExtractorError('Unsupported clip storage location "%s"' % (storage_location,))

        path = data['download_url']
        iframe = self._download_webpage(
            'https://iframe.cloudflarestream.com/' + path, video_id,
            'Downloading player iframe')
        # The hard-coded fallback is a known public customer prefix.
        subdomain = self._search_regex(
            r'''\bcustomer-domain-prefix\s*=\s*("|')(?P<sd>[\w-]+)\1''',
            iframe, 'subdomain', group='sd', fatal=False) or 'customer-2ut9yn3y6fta1yxe'

        formats = self._extract_mpd_formats(
            self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id,
            query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash')
        formats.extend(self._extract_m3u8_formats(
            self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4',
            query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls'))

        return merge_dicts({
            'id': video_id,
            'formats': formats,
            'uploader': ' '.join(traverse_obj(data, (
                'creator', ('first_name', 'last_name'), T(compat_str)))) or None,
        }, traverse_obj(data, {
            'title': ('project', 'project_name', T(compat_str)),
            'timestamp': ('created_at', T(unified_timestamp)),
            'thumbnail': ('thumbnail_url', T(url_or_none)),
        }), rev=True)


================================================
FILE: youtube_dl/extractor/cliphunter.py
================================================
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    url_or_none,
)


class CliphunterIE(InfoExtractor):
    """Extractor for cliphunter.com video pages (formats from inline JSON)."""
    IE_NAME = 'cliphunter'

    _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/
        (?P<id>[0-9]+)/
        (?P<seo>.+?)(?:$|[#\?])
    '''
    _TESTS = [{
        'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo',
        'md5': 'b7c9bbd4eb3a226ab91093714dcaa480',
        'info_dict': {
            'id': '1012420',
            'ext': 'flv',
            'title': 'Fun Jynx Maze solo',
            'thumbnail': r're:^https?://.*\.jpg$',
            'age_limit': 18,
        },
        'skip': 'Video gone',
    }, {
        'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz',
        'md5': '55a723c67bfc6da6b0cfa00d55da8a27',
        'info_dict': {
            'id': '2019449',
            'ext': 'mp4',
            'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz',
            'thumbnail': r're:^https?://.*\.jpg$',
            'age_limit': 18,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        video_title = self._search_regex(
            r'mediaTitle = "([^"]+)"', webpage, 'title')

        # Formats live in a JS object literal assigned to gexoFiles.
        gexo_files = self._parse_json(
            self._search_regex(
                r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'),
            video_id)

        formats = []
        for format_id, f in gexo_files.items():
            video_url = url_or_none(f.get('url'))
            if not video_url:
                continue
            fmt = f.get('fmt')
            height = f.get('h')
            # Prefer a synthesized "<fmt>_<height>p" id when both are known.
            format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id
            formats.append({
                'url': video_url,
                'format_id': format_id,
                'width': int_or_none(f.get('w')),
                'height': int_or_none(height),
                'tbr': int_or_none(f.get('br')),
            })
        self._sort_formats(formats)

        thumbnail = self._search_regex(
            r"var\s+mov_thumb\s*=\s*'([^']+)';",
            webpage, 'thumbnail', fatal=False)

        return {
            'id': video_id,
            'title': video_title,
            'formats': formats,
            'age_limit': self._rta_search(webpage),
            'thumbnail': thumbnail,
        }


================================================
FILE: youtube_dl/extractor/clippit.py
================================================
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    parse_iso8601,
    qualities,
)

import re


class ClippitIE(InfoExtractor):
    """Extractor for clippituser.tv clip pages (sd/hd mp4 via data attrs)."""
    _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)'
    _TEST = {
        'url': 'https://www.clippituser.tv/c/evmgm',
        'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09',
        'info_dict': {
            'id': 'evmgm',
            'ext': 'mp4',
            'title': 'Bye bye Brutus. #BattleBots - Clippit',
            'uploader': 'lizllove',
            'uploader_url': 'https://www.clippituser.tv/p/lizllove',
            'timestamp': 1472183818,
            'upload_date': '20160826',
            'description': 'BattleBots | ABC',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # NOTE(review): pattern looks truncated by extraction (no closing
        # </title> part visible) — confirm against upstream.
        title = self._html_search_regex(r'<title.*>(.+?)', webpage, 'title')

        FORMATS = ('sd', 'hd')
        quality = qualities(FORMATS)

        formats = []
        for format_id in FORMATS:
            url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id,
                                          webpage, 'url', fatal=False)
            if not url:
                continue
            # NOTE(review): named group garbled by extraction; upstream
            # presumably uses (?P<height>\d+) — confirm before use.
            match = re.search(r'/(?P\d+)\.mp4', url)
            formats.append({
                'url': url,
                'format_id': format_id,
                'quality': quality(format_id),
                'height': int(match.group('height')) if match else None,
            })

        uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n',
                                           webpage, 'uploader', fatal=False)
        uploader_url = ('https://www.clippituser.tv/p/' + uploader
                        if uploader else None)

        timestamp = self._html_search_regex(r'datetime="(.+?)"',
                                            webpage, 'date', fatal=False)
        thumbnail = self._html_search_regex(r'data-image="(.+?)"',
                                            webpage, 'thumbnail', fatal=False)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'uploader': uploader,
            'uploader_url': uploader_url,
            'timestamp': parse_iso8601(timestamp),
            'description': self._og_search_description(webpage),
            'thumbnail': thumbnail,
        }


================================================
FILE: youtube_dl/extractor/cliprs.py
================================================
# coding: utf-8
from __future__ import unicode_literals

from .onet import OnetBaseIE


class ClipRsIE(OnetBaseIE):
    """Extractor for clip.rs articles; delegates to the Onet MVP backend."""
    # NOTE(review): named group garbled by extraction; upstream presumably
    # uses (?P<id>[^/]+) — confirm before use.
    _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+'
    _TEST = {
        'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732',
        'md5': 'c412d57815ba07b56f9edc7b5d6a14e5',
        'info_dict': {
            'id': '1488842.1399140381',
            'ext': 'mp4',
            'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli',
'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026', 'duration': 229, 'timestamp': 1459850243, 'upload_date': '20160405', } } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) mvp_id = self._search_mvp_id(webpage) info_dict = self._extract_from_id(mvp_id, webpage) info_dict['display_id'] = display_id return info_dict ================================================ FILE: youtube_dl/extractor/clipsyndicate.py ================================================ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( find_xpath_attr, fix_xml_ampersands ) class ClipsyndicateIE(InfoExtractor): _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', 'md5': '4d7d549451bad625e0ff3d7bd56d776c', 'info_dict': { 'id': '4629301', 'ext': 'mp4', 'title': 'Brick Briscoe', 'duration': 612, 'thumbnail': r're:^https?://.+\.jpg', }, }, { 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) js_player = self._download_webpage( 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, video_id, 'Downlaoding player') # it includes a required token flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, video_id, 'Downloading video info', transform_source=fix_xml_ampersands) track_doc = pdoc.find('trackList/track') def find_param(name): node = find_xpath_attr(track_doc, './/param', 'name', name) if node is not None: return node.attrib['value'] return { 'id': video_id, 'title': find_param('title'), 'url': track_doc.find('location').text, 'thumbnail': find_param('thumbnail'), 'duration': int(find_param('duration')), } 
================================================ FILE: youtube_dl/extractor/closertotruth.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor class CloserToTruthIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', 'info_dict': { 'id': '0_zof1ktre', 'display_id': 'solutions-the-mind-body-problem', 'ext': 'mov', 'title': 'Solutions to the Mind-Body Problem?', 'upload_date': '20140221', 'timestamp': 1392956007, 'uploader_id': 'CTTXML' }, 'params': { 'skip_download': True, }, }, { 'url': 'http://closertotruth.com/episodes/how-do-brains-work', 'info_dict': { 'id': '0_iuxai6g6', 'display_id': 'how-do-brains-work', 'ext': 'mov', 'title': 'How do Brains Work?', 'upload_date': '20140221', 'timestamp': 1392956024, 'uploader_id': 'CTTXML' }, 'params': { 'skip_download': True, }, }, { 'url': 'http://closertotruth.com/interviews/1725', 'info_dict': { 'id': '1725', 'title': 'AyaFr-002', }, 'playlist_mincount': 2, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) partner_id = self._search_regex( r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', webpage, 'kaltura partner_id') title = self._search_regex( r'(.+?)\s*\|\s*.+?', webpage, 'video title') select = self._search_regex( r'(?s)]+id="select-version"[^>]*>(.+?)', webpage, 'select version', default=None) if select: entry_ids = set() entries = [] for mobj in re.finditer( r']+value=(["\'])(?P[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P[^<]+)', webpage): entry_id = mobj.group('id') if entry_id in entry_ids: continue entry_ids.add(entry_id) entries.append({ '_type': 'url_transparent', 'url': 'kaltura:%s:%s' % (partner_id, entry_id), 'ie_key': 'Kaltura', 'title': mobj.group('title'), }) if entries: return self.playlist_result(entries, display_id, 
title) entry_id = self._search_regex( r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2', webpage, 'kaltura entry_id', group='id') return { '_type': 'url_transparent', 'display_id': display_id, 'url': 'kaltura:%s:%s' % (partner_id, entry_id), 'ie_key': 'Kaltura', 'title': title } ================================================ FILE: youtube_dl/extractor/cloudflarestream.py ================================================ # coding: utf-8 from __future__ import unicode_literals import base64 import re from .common import InfoExtractor class CloudflareStreamIE(InfoExtractor): _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' _EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+' _VALID_URL = r'''(?x) https?:// (?: (?:watch\.)?%s/| %s ) (?P<id>%s) ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) _TESTS = [{ 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'info_dict': { 'id': '31c9291ab41fac05471db4e73aa11717', 'ext': 'mp4', 'title': '31c9291ab41fac05471db4e73aa11717', }, 'params': { 'skip_download': True, }, }, { 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', 'only_matching': True, }, { 'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd', 'only_matching': True, }, { 'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e', 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return [ mobj.group('url') for mobj in re.finditer( r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE), webpage)] def _real_extract(self, url): video_id = self._match_id(url) domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' base_url = 'https://%s/%s/' % (domain, video_id) if '.' 
in video_id:
            # signed ids are JWT-like tokens; the real id is the 'sub' claim
            # of the (base64url-encoded) payload segment
            video_id = self._parse_json(base64.urlsafe_b64decode(
                video_id.split('.')[1]), video_id)['sub']
        manifest_base_url = base_url + 'manifest/video.'
        # Collect both HLS and DASH variants; either manifest may be missing.
        formats = self._extract_m3u8_formats(
            manifest_base_url + 'm3u8', video_id, 'mp4',
            'm3u8_native', m3u8_id='hls', fatal=False)
        formats.extend(self._extract_mpd_formats(
            manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False))
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_id,
            'thumbnail': base_url + 'thumbnails/thumbnail.jpg',
            'formats': formats,
        }


================================================
FILE: youtube_dl/extractor/cloudy.py
================================================
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    str_to_int,
    unified_strdate,
)


class CloudyIE(InfoExtractor):
    IE_DESC = 'cloudy.ec'
    _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
    _TESTS = [{
        'url': 'https://www.cloudy.ec/v/af511e2527aac',
        'md5': '29832b05028ead1b58be86bf319397ca',
        'info_dict': {
            'id': 'af511e2527aac',
            'ext': 'mp4',
            'title': 'Funny Cats and Animals Compilation june 2013',
            'upload_date': '20130913',
            'view_count': int,
        }
    }, {
        'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # Media URLs come from the embed page's HTML5 <video> markup.
        webpage = self._download_webpage(
            'https://www.cloudy.ec/embed.php', video_id, query={
                'id': video_id,
                'playerPage': 1,
                'autoplay': 1,
            })

        info = self._parse_html5_media_entries(url, webpage, video_id)[0]

        # Metadata (title, date, views) lives on the watch page; best-effort.
        webpage = self._download_webpage(
            'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False)

        if webpage:
            info.update({
                'title': self._search_regex(
                    r'<h\d[^>]*>([^<]+)<', webpage, 'title'),
                'upload_date': unified_strdate(self._search_regex(
                    r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage,
                    'upload date', fatal=False)),
                'view_count': str_to_int(self._search_regex(
                    r'([\d,.]+) views<', webpage, 'view count', fatal=False)),
            })

        if not info.get('title'):
            info['title'] = video_id

        info['id'] = video_id

        return info


================================================
FILE: youtube_dl/extractor/clubic.py
================================================
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    clean_html,
    qualities,
)


class ClubicIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html'

    _TESTS = [{
        'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html',
        'md5': '1592b694ba586036efac1776b0b43cd3',
        'info_dict': {
            'id': '448474',
            'ext': 'mp4',
            'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité',
            'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*',
            'thumbnail': r're:^http://img\.clubic\.com/.*\.jpg$',
        }
    }, {
        'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id
        player_page = self._download_webpage(player_url, video_id)

        # The M6Web player embeds its configuration as a JS assignment.
        config = self._parse_json(self._search_regex(
            r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page,
            'configuration'), video_id)

        video_info = config['videoInfo']
        sources = config['sources']
        quality_order = qualities(['sd', 'hq'])

        formats = [{
            'format_id': src['streamQuality'],
            'url': src['src'],
            'quality': quality_order(src['streamQuality']),
        } for src in sources]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_info['title'],
            'formats': formats,
            'description': clean_html(video_info.get('description')),
            'thumbnail': config.get('poster'),
        }


================================================
FILE: youtube_dl/extractor/clyp.py
================================================
from
__future__ import unicode_literals

from .common import InfoExtractor
from ..compat import (
    compat_parse_qs,
    compat_urllib_parse_urlparse,
)
from ..utils import (
    float_or_none,
    unified_timestamp,
)


class ClypIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)'
    _TESTS = [{
        'url': 'https://clyp.it/ojz2wfah',
        'md5': '1d4961036c41247ecfdcc439c0cddcbb',
        'info_dict': {
            'id': 'ojz2wfah',
            'ext': 'mp3',
            'title': 'Krisson80 - bits wip wip',
            'description': '#Krisson80BitsWipWip #chiptune\n#wip',
            'duration': 263.21,
            'timestamp': 1443515251,
            'upload_date': '20150929',
        },
    }, {
        'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d',
        'info_dict': {
            'id': 'b04p1odi',
            'ext': 'mp3',
            'title': 'GJ! (Reward Edit)',
            'description': 'Metal Resistance (THE ONE edition)',
            'duration': 177.789,
            'timestamp': 1528241278,
            'upload_date': '20180605',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        audio_id = self._match_id(url)

        # Private tracks carry an access token in the URL query string;
        # forward it to the API request when present.
        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        token = qs.get('token', [None])[0]

        query = {}
        if token:
            query['token'] = token

        metadata = self._download_json(
            'https://api.clyp.it/%s' % audio_id, audio_id, query=query)

        # The API exposes up to four audio-only variants:
        # Url/SecureUrl x Ogg/Mp3.
        formats = []
        for secure in ('', 'Secure'):
            for ext in ('Ogg', 'Mp3'):
                format_id = '%s%s' % (secure, ext)
                format_url = metadata.get('%sUrl' % format_id)
                if format_url:
                    formats.append({
                        'url': format_url,
                        'format_id': format_id,
                        'vcodec': 'none',
                    })
        self._sort_formats(formats)

        title = metadata['Title']
        description = metadata.get('Description')
        duration = float_or_none(metadata.get('Duration'))
        timestamp = unified_timestamp(metadata.get('DateCreated'))

        return {
            'id': audio_id,
            'title': title,
            'description': description,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
        }


================================================
FILE: youtube_dl/extractor/cmt.py
================================================
from __future__ import unicode_literals

from .mtv import MTVIE


class CMTIE(MTVIE):
    IE_NAME = 'cmt.com'
    _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)'

    _TESTS = [{
        'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061',
        'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2',
        'info_dict': {
            'id': '989124',
            'ext': 'mp4',
            'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
            'description': 'Blame It All On My Roots',
        },
        'skip': 'Video not available',
    }, {
        'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908',
        'md5': 'e61a801ca4a183a466c08bd98dccbb1c',
        'info_dict': {
            'id': '1504699',
            'ext': 'mp4',
            'title': 'Still The King Ep. 109 in 3 Minutes',
            'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.',
            'timestamp': 1469421000.0,
            'upload_date': '20160725',
        },
    }, {
        'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172',
        'only_matching': True,
    }, {
        'url': 'http://www.cmt.com/full-episodes/537qb3/nashville-the-wayfaring-stranger-season-5-ep-501',
        'only_matching': True,
    }, {
        'url': 'http://www.cmt.com/video-clips/t9e4ci/nashville-juliette-in-2-minutes',
        'only_matching': True,
    }]

    def _extract_mgid(self, webpage):
        # Prefer the explicit contentUri assignment; fall back to the
        # triforce feed lookup used by newer pages.
        mgid = self._search_regex(
            r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1',
            webpage, 'mgid', group='mgid', default=None)
        if not mgid:
            mgid = self._extract_triforce_mgid(webpage)
        return mgid

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        mgid = self._extract_mgid(webpage)
        # Delegate the actual media extraction to the MTV services embed.
        return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)


================================================
FILE: youtube_dl/extractor/cnbc.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import smuggle_url
class CNBCIE(InfoExtractor):
    _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://video.cnbc.com/gallery/?video=3000503714',
        'info_dict': {
            'id': '3000503714',
            'ext': 'mp4',
            'title': 'Fighting zombies is big business',
            'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e',
            'timestamp': 1459332000,
            'upload_date': '20160330',
            'uploader': 'NBCU-CNBC',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # Delegate to the ThePlatform extractor with a smuggled SMIL hint.
        return {
            '_type': 'url_transparent',
            'ie_key': 'ThePlatform',
            'url': smuggle_url(
                'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id,
                {'force_smil_url': True}),
            'id': video_id,
        }


class CNBCVideoIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
    _TEST = {
        'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
        'info_dict': {
            'id': '7000031301',
            'ext': 'mp4',
            'title': "Trump: I don't necessarily agree with raising rates",
            'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3',
            'timestamp': 1531958400,
            'upload_date': '20180719',
            'uploader': 'NBCU-CNBC',
        },
        'params': {
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        path, display_id = re.match(self._VALID_URL, url).groups()
        # Resolve the page path to a numeric video id via the GraphQL API,
        # then hand off to CNBCIE above.
        video_id = self._download_json(
            'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
                'query': '''{
  page(path: "%s") {
    vcpsId
  }
}''' % path,
            })['data']['page']['vcpsId']
        return self.url_result(
            'http://video.cnbc.com/gallery/?video=%d' % video_id,
            CNBCIE.ie_key())


================================================
FILE: youtube_dl/extractor/cnn.py
================================================
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from .turner import TurnerBaseIE
from ..utils import url_basename


class CNNIE(TurnerBaseIE):
    _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''

    _TESTS = [{
        'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
        'md5': '3e6121ea48df7e2259fe73a0628605c4',
        'info_dict': {
            'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
            'ext': 'mp4',
            'title': 'Nadal wins 8th French Open title',
            'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
            'duration': 135,
            'upload_date': '20130609',
        },
        'expected_warnings': ['Failed to download m3u8 information'],
    }, {
        'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29',
        'md5': 'b5cc60c60a3477d185af8f19a2a26f4e',
        'info_dict': {
            'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
            'ext': 'mp4',
            'title': "Student's epic speech stuns new freshmen",
            'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
            'upload_date': '20130821',
        },
        'expected_warnings': ['Failed to download m3u8 information'],
    }, {
        'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
        'md5': 'f14d02ebd264df951feb2400e2c25a1b',
        'info_dict': {
            'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln',
            'ext': 'mp4',
            'title': 'Nashville Ep. 1: Hand crafted skateboards',
            'description': 'md5:e7223a503315c9f150acac52e76de086',
            'upload_date': '20141222',
        },
        'expected_warnings': ['Failed to download m3u8 information'],
    }, {
        'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
        'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
        'info_dict': {
            'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
            'ext': 'mp4',
            'title': '5 stunning stats about Netflix',
            'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
            'upload_date': '20160819',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
        'only_matching': True,
    }, {
        'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
        'only_matching': True,
    }, {
        'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
        'only_matching': True,
    }]

    # Per-subdomain endpoints for the Turner CVP metadata and media hosts.
    _CONFIG = {
        # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
        'edition': {
            'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
            'media_src': 'http://pmd.cdn.turner.com/cnn/big',
        },
        # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
        'money': {
            'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
            'media_src': 'http://ht3.cdn.turner.com/money/big',
        },
    }

    def _extract_timestamp(self, video_data):
        # TODO: fix timestamp extraction
        return None

    def _real_extract(self, url):
        sub_domain, path, page_title = re.match(self._VALID_URL, url).groups()
        # Anything that is not money/edition uses the edition endpoints.
        if sub_domain not in ('money', 'edition'):
            sub_domain = 'edition'
        config = self._CONFIG[sub_domain]
        return self._extract_cvp_info(
            config['data_src'] % path, page_title, {
                'default': {
                    'media_src': config['media_src'],
                },
                'f4m': {
                    'host': 'cnn-vh.akamaihd.net',
                },
            })


class
CNNBlogsIE(InfoExtractor):
    _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
    _TEST = {
        'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
        'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
        'info_dict': {
            'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
            'ext': 'mp4',
            'title': 'Criminalizing journalism?',
            'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
            'upload_date': '20140209',
        },
        'expected_warnings': ['Failed to download m3u8 information'],
        'add_ie': ['CNN'],
    }

    def _real_extract(self, url):
        # Blog pages embed the canonical CNN video URL in a data attribute;
        # delegate extraction to CNNIE.
        webpage = self._download_webpage(url, url_basename(url))
        cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
        return self.url_result(cnn_url, CNNIE.ie_key())


class CNNArticleIE(InfoExtractor):
    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
    _TEST = {
        'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
        'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
        'info_dict': {
            'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
            'ext': 'mp4',
            'title': 'Obama: Cyberattack not an act of war',
            'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b',
            'upload_date': '20141221',
        },
        'expected_warnings': ['Failed to download m3u8 information'],
        'add_ie': ['CNN'],
    }

    def _real_extract(self, url):
        # Article pages reference the video path in an inline JS literal;
        # delegate extraction to CNNIE.
        webpage = self._download_webpage(url, url_basename(url))
        cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
        return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())


================================================
FILE: youtube_dl/extractor/comedycentral.py
================================================
from __future__ import unicode_literals

from .mtv import MTVServicesInfoExtractor


class ComedyCentralIE(MTVServicesInfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})'
    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'

    _TESTS = [{
        'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike',
        'md5': 'b8acb347177c680ff18a292aa2166f80',
        'info_dict': {
            'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025',
            'ext': 'mp4',
            'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike',
            'description': 'md5:5334307c433892b85f4f5e5ac9ef7498',
            'timestamp': 1598670000,
            'upload_date': '20200829',
        },
    }, {
        'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314',
        'only_matching': True,
    }, {
        'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate',
        'only_matching': True,
    }]


class ComedyCentralTVIE(MTVServicesInfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})'
    _TESTS = [{
        'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1',
        'info_dict': {
            'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285',
            'ext': 'mp4',
            'title': 'Josh Investigates',
            'description': 'Steht uns das Ende der Welt bevor?',
        },
    }]
    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
    _GEO_COUNTRIES = ['DE']

    def _get_feed_query(self, uri):
        # Parameters required by the international MRSS feed endpoint.
        return {
            'accountOverride': 'intl.mtvi.com',
            'arcEp': 'web.cc.tv',
            'ep': 'b9032c3a',
            'imageEp': 'web.cc.tv',
            'mgid': uri,
        }


================================================
FILE: youtube_dl/extractor/common.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import base64
import collections
import datetime
import functools
import hashlib
import json
import netrc
import os
import random
import re
import socket
import ssl
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
compat_integer_types, compat_http_client, compat_kwargs, compat_map as map, compat_open as open, compat_os_name, compat_str, compat_urllib_error, compat_urllib_parse_unquote, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, compat_xml_parse_error, compat_zip as zip, ) from ..downloader.f4m import ( get_base_url, remove_encrypted_media, ) from ..utils import ( NO_DEFAULT, age_restricted, base_url, bug_reports_message, clean_html, compiled_regex_type, determine_ext, determine_protocol, dict_get, error_to_compat_str, ExtractorError, extract_attributes, fix_xml_ampersands, float_or_none, GeoRestrictedError, GeoUtils, int_or_none, join_nonempty, js_to_json, JSON_LD_RE, mimetype2ext, orderedSet, parse_bitrate, parse_codecs, parse_duration, parse_iso8601, parse_m3u8_attributes, parse_resolution, RegexNotFoundError, sanitized_Request, sanitize_filename, str_or_none, str_to_int, strip_or_none, T, traverse_obj, try_get, unescapeHTML, unified_strdate, unified_timestamp, update_Request, update_url_query, urljoin, url_basename, url_or_none, variadic, xpath_element, xpath_text, xpath_with_ns, ) class InfoExtractor(object): """Information Extractor class. Information extractors are the classes that, given a URL, extract information about the video (or videos) the URL refers to. This information includes the real video URL, the video title, author and others. The information is stored in a dictionary which is then passed to the YoutubeDL. The YoutubeDL processes this information possibly downloading the video to the file system, among other possible outcomes. The type field determines the type of the result. By far the most common value (and the default if _type is missing) is "video", which indicates a single video. For a video, the dictionaries must include the following fields: id: Video identifier. title: Video title, unescaped. 
Additionally, it must contain either a formats entry or a url one: formats: A list of dictionaries for each format available, ordered from worst to best quality. Potential fields: * url The mandatory URL representing the media: for plain file media - HTTP URL of this file, for RTMP - RTMP URL, for HLS - URL of the M3U8 media playlist, for HDS - URL of the F4M manifest, for DASH - HTTP URL to plain file media (in case of unfragmented media) - URL of the MPD manifest or base URL representing the media if MPD manifest is parsed from a string (in case of fragmented media) for MSS - URL of the ISM manifest. * manifest_url The URL of the manifest file in case of fragmented media: for HLS - URL of the M3U8 master playlist, for HDS - URL of the F4M manifest, for DASH - URL of the MPD manifest, for MSS - URL of the ISM manifest. * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). Calculated from the format_id, width, height. and format_note fields if missing. * format_id A short description of the format ("mp4_h264_opus" or "19"). Technically optional, but strongly recommended. * format_note Additional info about the format ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known * resolution Textual description of width and height * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * fps Frame rate * vcodec Name of the video codec in use * container Name of the container format * filesize The number of bytes, if known in advance * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", "m3u8", "m3u8_native" or "http_dash_segments". 
* fragment_base_url Base URL for fragments. Each fragment's path value (if present) will be relative to this URL. * fragments A list of fragments of a fragmented media. Each fragment entry must contain either an url or a path. If an url is present it should be considered by a client. Otherwise both path and fragment_base_url must be present. Here is the list of all potential fields: * "url" - fragment's URL * "path" - fragment's path relative to fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) * "range" (optional, str of the form "start-end" to use in HTTP Range header) * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. < -1000 to hide the format (if there is another one which is strictly better) * language Language code, e.g. "de" or "en-US". * language_preference Is this in the language mentioned in the URL? 10 if it's what the URL is about, -1 for default (don't know), -10 otherwise, other values reserved for now. * quality Order number of the video quality of this format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. * source_preference Order number for this video source (quality takes higher priority) -1 for default (order by other properties), -2 or smaller for less than default. * http_headers A dictionary of additional HTTP headers to add to the request. * stretched_ratio If given and not 1, indicates that the video's pixels are not square. width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. * available_at Unix timestamp of when a format will be available to download * downloader_options A dictionary of downloader options as described in FileDownloader url: Final video URL. ext: Video filename extension. 
format: The video format, defaults to ext (used for --get-format) player_url: SWF Player URL (used for rtmpdump). The following fields are optional: alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", and display_id "dancing-naked-mole-rats" thumbnails: A list of dictionaries, with the following entries: * "id" (optional, string) - Thumbnail format ID * "url" * "preference" (optional, int) - quality of the image * "width" (optional, int) * "height" (optional, int) * "resolution" (optional, string "{width}x{height}", deprecated) * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. release_timestamp: UNIX timestamp of the moment the video was released. release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available (uploaded). upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. channel: Full name of the channel the video is uploaded on. Note that channel fields may or may not repeat uploader fields. This depends on a particular extractor. channel_id: Id of the channel. channel_url: Full URL to a channel webpage. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. 
"tag" is usually a language code, and "subformats" is a list sorted from lower to higher preference, each element is a dictionary with the "ext" entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following properties (all but one of text or html optional): * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment * "timestamp" - UNIX timestamp of comment * "parent" - ID of the comment this one is replying to. Set to "root" to indicate that this is a comment to the original video. age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. start_time: Time in seconds where the reproduction should start, as specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. 
chapters: A list of dictionaries, with the following entries: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: chapter: Name or title of the chapter the video belongs to. chapter_number: Number of the chapter the video belongs to, as an integer. chapter_id: Id of the chapter the video belongs to, as a unicode string. The following fields should only be used when the video is an episode of some series, programme or podcast: series: Title of the series or programme the video episode belongs to. season: Title of the season the video episode belongs to. season_number: Number of the season the video episode belongs to, as an integer. season_id: Id of the season the video episode belongs to, as a unicode string. episode: Title of the video episode. Unlike mandatory video title field, this field should denote the exact title of the video episode without any kind of decoration. episode_number: Number of the video episode within a season, as an integer. episode_id: Id of the video episode, as a unicode string. The following fields should only be used when the media is a track or a part of a music album: track: Title of the track. track_number: Number of the track within an album or a disc, as an integer. track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), as a unicode string. artist: Artist(s) of the track. genre: Genre(s) of the track. album: Title of the album the track belongs to. album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). album_artist: List of all artists appeared on the album (e.g. "Ash Borer / Fell Voices" or "Various Artists", useful for splits and compilations). disc_number: Number of the disc or other physical medium the track belongs to, as an integer. release_year: Year (YYYY) when the album was released. 
Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. _type "playlist" indicates multiple videos. There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. Additionally, playlists can have "id", "title", "description", "uploader", "uploader_id", "uploader_url", "duration" attributes with the same semantics as videos (see above). _type "multi_video" indicates that there are multiple videos that form a single show, for examples multiple acts of an opera or TV episode. It must have an entries key like a playlist and contain all the keys required for a video at the same time. _type "url" indicates that the video must be extracted from another location, possibly by a different extractor. Its only required key is: "url" - the next URL to extract. The key "ie_key" can be set to the class name (minus the trailing "IE", e.g. "Youtube") if the extractor class is known in advance. Additionally, the dictionary may have any properties of the resolved entity known in advance, for example "title" if the title of the referred video is known ahead of time. _type "url_transparent" entities have the same specification as "url", but indicate that the given additional information is more precise than the one associated with the resolved URL. This is useful when a site employs a video service that hosts the video and its technical metadata, but that video service does not embed a useful title, description etc. A subclass of InfoExtractor must be defined to handle each specific site (or several sites). Such a concrete subclass should be added to the list of extractors. It should also: * define its _VALID_URL attribute as a regexp, or a Sequence of alternative regexps (but see below) * re-define the _real_extract() method * optionally re-define the _real_initialize() method. 
An extractor subclass may also override suitable() if necessary, but the function signature must be preserved and the function must import everything it needs (except other extractors), so that lazy_extractors works correctly. If the subclass's suitable() and _real_extract() functions avoid using _VALID_URL, the subclass need not set that class attribute. An abstract subclass of InfoExtractor may be used to simplify implementation within an extractor module; it should not be added to the list of extractors. _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on country code provided with geo_bypass_country. _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted countries for this extractor. One of these countries will be used by geo restriction bypass mechanism right away in order to bypass geo restriction, of course, if the mechanism is not disabled. _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted IP blocks in CIDR notation for this extractor. One of these IP blocks will be used by geo restriction bypass mechanism similarly to _GEO_COUNTRIES. Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ _ready = False _downloader = None _x_forwarded_for_ip = None _GEO_BYPASS = True _GEO_COUNTRIES = None _GEO_IP_BLOCKS = None _WORKING = True # supply this in public subclasses: used in supported sites list, etc # IE_DESC = 'short description of IE' def __init__(self, downloader=None): """Constructor. 
        Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def __match_valid_url(cls, url):
        # Match url against _VALID_URL, caching the compiled pattern(s) on
        # this exact class.  Returns the re match object or None.
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for cls, whereas getattr would also
        # match its superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            # _VALID_URL can now be a list/tuple of patterns
            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
        # 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7
        for p in cls._VALID_URL_RE:
            p = p.match(url)
            if p:
                return p

    # The public alias can safely be overridden, as in some back-ports
    _match_valid_url = __match_valid_url

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls.__match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        # Extract the named 'id' group from the _VALID_URL match; the URL is
        # expected to have been vetted with suitable() already.
        m = cls.__match_valid_url(url)
        assert m
        return compat_str(m.group('id'))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Geo bypass is (re)applied on every call; _real_initialize only once.
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
        """
        # Only pick a fake IP once per instance.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self.get_param('verbose', False):
                    self.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self.get_param('verbose', False):
                    self.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most one retry: the second pass only happens when a geo
            # restriction error could be answered with a freshly faked IP.
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        # After a GeoRestrictedError, pick a random country from the
        # error-reported list and fake an IP there; returns True if the
        # caller should retry extraction.
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        # Cache object of the owning downloader.
        return self._downloader.cache

    @property
    def cookiejar(self):
        # Cookie jar of the owning downloader.
        return self._downloader.cookiejar

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        # Class name minus the trailing "IE".
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        # True if the HTTPError's status code matches expected_status, which
        # may be an int, a list/tuple of ints, or a predicate on the code.
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # NOTE(review): 'headers' has a mutable default ({}) and is
            # mutated here, so when callers rely on the default the fake IP
            # header is written into the shared default dict and leaks into
            # subsequent calls.
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(
            url_or_request, video_id, note, errnote, fatal,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        if urlh is False:
            # _request_webpage only returns False in the non-fatal path.
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        # Pick a text encoding: Content-Type charset first, then a <meta>
        # charset in the first 1 KiB, then a UTF-16 BOM, else UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        # Detect block pages injected by filtering middleboxes and raise a
        # helpful error instead of handing the block page to the extractor.
        first_block = content[:512]
        if ('<title>Access to this site is blocked' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
blocked_iframe = self._html_search_regex( r'' xml_root = self._html_search_regex( PLAYER_REGEX, start_page, 'xml root', default=None) if xml_root is None: # Probably need to authenticate login_res = self._login(webpage_url, display_id) if login_res is None: self.report_warning('Could not login.') else: start_page = login_res # Grab the url from the authenticated page xml_root = self._html_search_regex( PLAYER_REGEX, start_page, 'xml root') xml_name = self._html_search_regex( r'', webpage): url = self._search_regex( r'src=(["\'])(?P.+?partnerplayer.+?)\1', iframe, 'player URL', default=None, group='url') if url: break if not url: url = self._og_search_url(webpage) mobj = re.match( self._VALID_URL, self._proto_relative_url(url.strip())) player_id = mobj.group('player_id') if not display_id: display_id = player_id if player_id: player_page = self._download_webpage( url, display_id, note='Downloading player page', errnote='Could not download player page') video_id = self._search_regex( r'\d+)' _TEST = { 'url': 'http://www.pearvideo.com/video_1076290', 'info_dict': { 'id': '1076290', 'ext': 'mp4', 'title': '小浣熊在主人家玻璃上滚石头:没砸', 'description': 'md5:01d576b747de71be0ee85eb7cac25f9d', 'timestamp': 1494275280, 'upload_date': '20170508', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) quality = qualities( ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src')) formats = [{ 'url': mobj.group('url'), 'format_id': mobj.group('id'), 'quality': quality(mobj.group('id')), } for mobj in re.finditer( r'(?P[a-zA-Z]+)Url\s*=\s*(["\'])(?P(?:https?:)?//.+?)\2', webpage)] self._sort_formats(formats) title = self._search_regex( (r']+\bclass=(["\'])video-tt\1[^>]*>(?P[^<]+)', r'<[^>]+\bdata-title=(["\'])(?P(?:(?!\1).)+)\1'), webpage, 'title', group='value') description = self._search_regex( (r']+\bclass=(["\'])summary\1[^>]*>(?P[^<]+)', r'<[^>]+\bdata-summary=(["\'])(?P(?:(?!\1).)+)\1'), webpage, 'description', default=None, 
            group='value') or self._html_search_meta('Description', webpage)
        # NOTE(review): the regex below looks truncated in this copy of the
        # file (leading tag text apparently stripped) -- confirm against
        # upstream before relying on it.
        timestamp = unified_timestamp(self._search_regex(
            r']+\bclass=["\']date["\'][^>]*>([^<]+)', webpage,
            'timestamp', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
        }


================================================
FILE: youtube_dl/extractor/peekvids.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    get_element_by_class,
    int_or_none,
    merge_dicts,
    url_or_none,
)


class PeekVidsIE(InfoExtractor):
    # NOTE(review): several string literals in this class appear to have lost
    # HTML-tag-like text and regex group names (e.g. "(?P" with no "<id>") in
    # this copy of the file -- restore from upstream before use.
    _VALID_URL = r'''(?x)
        https?://(?:www\.)?peekvids\.com/
        (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=)
        (?P[^/?&#]*)
    '''
    _TESTS = [{
        'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd',
        'md5': '2ff6a357a9717dc9dc9894b51307e9a2',
        'info_dict': {
            'id': '1262717',
            'display_id': 'BSyLMbN0YCd',
            'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
            'ext': 'mp4',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
            'timestamp': 1642579329,
            'upload_date': '20220119',
            'duration': 416,
            'view_count': int,
            'age_limit': 18,
            'uploader': 'SEXYhub.com',
            'categories': list,
            'tags': list,
        },
    }]
    # Overridden by subclasses (see PlayVidsIE) to reuse the extraction logic.
    _DOMAIN = 'www.peekvids.com'

    def _get_detail(self, html):
        # Container element holding description/uploader/categories/tags.
        return get_element_by_class('detail-video-block', html)

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # 429 is accepted so the rate-limit page can be inspected below
        # instead of aborting the download outright.
        webpage = self._download_webpage(url, video_id, expected_status=429)
        if '>Rate Limit Exceeded' in webpage:
            raise ExtractorError(
                '[%s] %s: %s' % (self.IE_NAME, video_id, 'You are suspected as a bot. Wait, or pass the captcha test on the site and provide --cookies.'),
                expected=True)

        title = self._html_search_regex(r'(?s)]*>(.+?)', webpage, 'title')

        display_id = video_id
        video_id = self._search_regex(r'(?s)]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID')
        srcs = self._download_json(
            'https://%s/v-alt/%s' % (self._DOMAIN, video_id), video_id,
            note='Downloading list of source files')
        # Source keys look like "data-src480"; the numeric suffix doubles as
        # the format id and height.
        formats = [{
            'url': f_url,
            'format_id': f_id,
            'height': int_or_none(f_id),
        } for f_url, f_id in (
            (url_or_none(f_v), f_match.group(1))
            for f_v, f_match in (
                (v, re.match(r'^data-src(\d{3,})$', k))
                for k, v in srcs.items() if v)
            if f_match)
            if f_url
        ]
        if not formats:
            formats = [{'url': url} for url in srcs.values()]
        self._sort_formats(formats)

        info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
        info.pop('url', None)
        # may not have found the thumbnail if it was in a list in the ld+json
        info.setdefault('thumbnail', self._og_search_thumbnail(webpage))
        detail = self._get_detail(webpage) or ''
        # NOTE(review): the statements below look corrupted in this copy --
        # the "cat_tags" helper's definition (referenced by the merge_dicts
        # call) appears to have been lost, leaving undefined names ('name',
        # 'html', 'l') and an unreachable 'return'. Restore from upstream.
        info['description'] = self._html_search_regex(
            r'(?s)(.+?)(?:%s\s*<|]*>\s*%s\s*:\s*(.+?)' % (re.escape(name), ),
            html, name, default='')
        return [x for x in re.split(r'\s+', l) if x]

        return merge_dicts({
            'id': video_id,
            'display_id': display_id,
            'age_limit': 18,
            'formats': formats,
            'categories': cat_tags('Categories', detail),
            'tags': cat_tags('Tags', detail),
            'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None),
        }, info)


class PlayVidsIE(PeekVidsIE):
    _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|\w\w?/)?(?P[^/?#]*)'
    _TESTS = [{
        'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
        'md5': '2f12e50213dd65f142175da633c4564c',
        'info_dict': {
            'id': '1978030',
            'display_id': 'U3pBrYhsjXM',
            'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
            'ext': 'mp4',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description':
'md5:0a61df3620de26c0af8963b1a730cd69', 'timestamp': 1640435839, 'upload_date': '20211225', 'duration': 416, 'view_count': int, 'age_limit': 18, 'uploader': 'SEXYhub.com', 'categories': list, 'tags': list, }, }, { 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', 'only_matching': True, }, { 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM', 'only_matching': True, }, { 'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line', 'md5': 'e783986e596cafbf46411a174ab42ba6', 'info_dict': { 'id': '762385', 'display_id': 'bKmGLe3IwjZ', 'ext': 'mp4', 'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6', 'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef', 'timestamp': 1516958544, 'upload_date': '20180126', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 480, 'uploader': 'Brazzers', 'age_limit': 18, 'view_count': int, 'age_limit': 18, 'categories': list, 'tags': list, }, }, { 'url': 'https://www.playvids.com/v/47iUho33toY', 'md5': 'b056b5049d34b648c1e86497cf4febce', 'info_dict': { 'id': '700621', 'display_id': '47iUho33toY', 'ext': 'mp4', 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE', 'description': None, 'timestamp': 1507052209, 'upload_date': '20171003', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 332, 'uploader': 'Cacerenele', 'age_limit': 18, 'view_count': int, 'categories': list, 'tags': list, } }, { 'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances', 'md5': 'efa09be9f031314b7b7e3bc6510cd0df', 'info_dict': { 'id': '1523518', 'display_id': 'z3_7iwWCmqt', 'ext': 'mp4', 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances', 'description': None, 'timestamp': 1607470323, 'upload_date': '20201208', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 593, 'uploader': 'yorours', 'age_limit': 18, 'view_count': int, 'categories': list, 
'tags': list, }, }] _DOMAIN = 'www.playvids.com' def _get_detail(self, html): return get_element_by_class('detail-block', html) ================================================ FILE: youtube_dl/extractor/peertube.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, parse_resolution, str_or_none, try_get, unified_timestamp, url_or_none, urljoin, ) class PeerTubeIE(InfoExtractor): _INSTANCES_RE = r'''(?: # Taken from https://instances.joinpeertube.org/instances peertube\.rainbowswingers\.net| tube\.stanisic\.nl| peer\.suiri\.us| medias\.libox\.fr| videomensoif\.ynh\.fr| peertube\.travelpandas\.eu| peertube\.rachetjay\.fr| peertube\.montecsys\.fr| tube\.eskuero\.me| peer\.tube| peertube\.umeahackerspace\.se| tube\.nx-pod\.de| video\.monsieurbidouille\.fr| tube\.openalgeria\.org| vid\.lelux\.fi| video\.anormallostpod\.ovh| tube\.crapaud-fou\.org| peertube\.stemy\.me| lostpod\.space| exode\.me| peertube\.snargol\.com| vis\.ion\.ovh| videosdulib\.re| v\.mbius\.io| videos\.judrey\.eu| peertube\.osureplayviewer\.xyz| peertube\.mathieufamily\.ovh| www\.videos-libr\.es| fightforinfo\.com| peertube\.fediverse\.ru| peertube\.oiseauroch\.fr| video\.nesven\.eu| v\.bearvideo\.win| video\.qoto\.org| justporn\.cc| video\.vny\.fr| peervideo\.club| tube\.taker\.fr| peertube\.chantierlibre\.org| tube\.ipfixe\.info| tube\.kicou\.info| tube\.dodsorf\.as| videobit\.cc| video\.yukari\.moe| videos\.elbinario\.net| hkvideo\.live| pt\.tux\.tf| www\.hkvideo\.live| FIGHTFORINFO\.com| pt\.765racing\.com| peertube\.gnumeria\.eu\.org| nordenmedia\.com| peertube\.co\.uk| tube\.darfweb\.eu| tube\.kalah-france\.org| 0ch\.in| vod\.mochi\.academy| film\.node9\.org| peertube\.hatthieves\.es| video\.fitchfamily\.org| peertube\.ddns\.net| video\.ifuncle\.kr| video\.fdlibre\.eu| tube\.22decembre\.eu| peertube\.harmoniescreatives\.com| 
tube\.fabrigli\.fr| video\.thedwyers\.co| video\.bruitbruit\.com| peertube\.foxfam\.club| peer\.philoxweb\.be| videos\.bugs\.social| peertube\.malbert\.xyz| peertube\.bilange\.ca| libretube\.net| diytelevision\.com| peertube\.fedilab\.app| libre\.video| video\.mstddntfdn\.online| us\.tv| peertube\.sl-network\.fr| peertube\.dynlinux\.io| peertube\.david\.durieux\.family| peertube\.linuxrocks\.online| peerwatch\.xyz| v\.kretschmann\.social| tube\.otter\.sh| yt\.is\.nota\.live| tube\.dragonpsi\.xyz| peertube\.boneheadmedia\.com| videos\.funkwhale\.audio| watch\.44con\.com| peertube\.gcaillaut\.fr| peertube\.icu| pony\.tube| spacepub\.space| tube\.stbr\.io| v\.mom-gay\.faith| tube\.port0\.xyz| peertube\.simounet\.net| play\.jergefelt\.se| peertube\.zeteo\.me| tube\.danq\.me| peertube\.kerenon\.com| tube\.fab-l3\.org| tube\.calculate\.social| peertube\.mckillop\.org| tube\.netzspielplatz\.de| vod\.ksite\.de| peertube\.laas\.fr| tube\.govital\.net| peertube\.stephenson\.cc| bistule\.nohost\.me| peertube\.kajalinifi\.de| video\.ploud\.jp| video\.omniatv\.com| peertube\.ffs2play\.fr| peertube\.leboulaire\.ovh| peertube\.tronic-studio\.com| peertube\.public\.cat| peertube\.metalbanana\.net| video\.1000i100\.fr| peertube\.alter-nativ-voll\.de| tube\.pasa\.tf| tube\.worldofhauru\.xyz| pt\.kamp\.site| peertube\.teleassist\.fr| videos\.mleduc\.xyz| conf\.tube| media\.privacyinternational\.org| pt\.forty-two\.nl| video\.halle-leaks\.de| video\.grosskopfgames\.de| peertube\.schaeferit\.de| peertube\.jackbot\.fr| tube\.extinctionrebellion\.fr| peertube\.f-si\.org| video\.subak\.ovh| videos\.koweb\.fr| peertube\.zergy\.net| peertube\.roflcopter\.fr| peertube\.floss-marketing-school\.com| vloggers\.social| peertube\.iriseden\.eu| videos\.ubuntu-paris\.org| peertube\.mastodon\.host| armstube\.com| peertube\.s2s\.video| peertube\.lol| tube\.open-plug\.eu| open\.tube| peertube\.ch| peertube\.normandie-libre\.fr| peertube\.slat\.org| video\.lacaveatonton\.ovh| peertube\.uno| 
peertube\.servebeer\.com| peertube\.fedi\.quebec| tube\.h3z\.jp| tube\.plus200\.com| peertube\.eric\.ovh| tube\.metadocs\.cc| tube\.unmondemeilleur\.eu| gouttedeau\.space| video\.antirep\.net| nrop\.cant\.at| tube\.ksl-bmx\.de| tube\.plaf\.fr| tube\.tchncs\.de| video\.devinberg\.com| hitchtube\.fr| peertube\.kosebamse\.com| yunopeertube\.myddns\.me| peertube\.varney\.fr| peertube\.anon-kenkai\.com| tube\.maiti\.info| tubee\.fr| videos\.dinofly\.com| toobnix\.org| videotape\.me| voca\.tube| video\.heromuster\.com| video\.lemediatv\.fr| video\.up\.edu\.ph| balafon\.video| video\.ivel\.fr| thickrips\.cloud| pt\.laurentkruger\.fr| video\.monarch-pass\.net| peertube\.artica\.center| video\.alternanet\.fr| indymotion\.fr| fanvid\.stopthatimp\.net| video\.farci\.org| v\.lesterpig\.com| video\.okaris\.de| tube\.pawelko\.net| peertube\.mablr\.org| tube\.fede\.re| pytu\.be| evertron\.tv| devtube\.dev-wiki\.de| raptube\.antipub\.org| video\.selea\.se| peertube\.mygaia\.org| video\.oh14\.de| peertube\.livingutopia\.org| peertube\.the-penguin\.de| tube\.thechangebook\.org| tube\.anjara\.eu| pt\.pube\.tk| video\.samedi\.pm| mplayer\.demouliere\.eu| widemus\.de| peertube\.me| peertube\.zapashcanon\.fr| video\.latavernedejohnjohn\.fr| peertube\.pcservice46\.fr| peertube\.mazzonetto\.eu| video\.irem\.univ-paris-diderot\.fr| video\.livecchi\.cloud| alttube\.fr| video\.coop\.tools| video\.cabane-libre\.org| peertube\.openstreetmap\.fr| videos\.alolise\.org| irrsinn\.video| video\.antopie\.org| scitech\.video| tube2\.nemsia\.org| video\.amic37\.fr| peertube\.freeforge\.eu| video\.arbitrarion\.com| video\.datsemultimedia\.com| stoptrackingus\.tv| peertube\.ricostrongxxx\.com| docker\.videos\.lecygnenoir\.info| peertube\.togart\.de| tube\.postblue\.info| videos\.domainepublic\.net| peertube\.cyber-tribal\.com| video\.gresille\.org| peertube\.dsmouse\.net| cinema\.yunohost\.support| tube\.theocevaer\.fr| repro\.video| tube\.4aem\.com| quaziinc\.com| peertube\.metawurst\.space| 
videos\.wakapo\.com| video\.ploud\.fr| video\.freeradical\.zone| tube\.valinor\.fr| refuznik\.video| pt\.kircheneuenburg\.de| peertube\.asrun\.eu| peertube\.lagob\.fr| videos\.side-ways\.net| 91video\.online| video\.valme\.io| video\.taboulisme\.com| videos-libr\.es| tv\.mooh\.fr| nuage\.acostey\.fr| video\.monsieur-a\.fr| peertube\.librelois\.fr| videos\.pair2jeux\.tube| videos\.pueseso\.club| peer\.mathdacloud\.ovh| media\.assassinate-you\.net| vidcommons\.org| ptube\.rousset\.nom\.fr| tube\.cyano\.at| videos\.squat\.net| video\.iphodase\.fr| peertube\.makotoworkshop\.org| peertube\.serveur\.slv-valbonne\.fr| vault\.mle\.party| hostyour\.tv| videos\.hack2g2\.fr| libre\.tube| pire\.artisanlogiciel\.net| videos\.numerique-en-commun\.fr| video\.netsyms\.com| video\.die-partei\.social| video\.writeas\.org| peertube\.swarm\.solvingmaz\.es| tube\.pericoloso\.ovh| watching\.cypherpunk\.observer| videos\.adhocmusic\.com| tube\.rfc1149\.net| peertube\.librelabucm\.org| videos\.numericoop\.fr| peertube\.koehn\.com| peertube\.anarchmusicall\.net| tube\.kampftoast\.de| vid\.y-y\.li| peertube\.xtenz\.xyz| diode\.zone| tube\.egf\.mn| peertube\.nomagic\.uk| visionon\.tv| videos\.koumoul\.com| video\.rastapuls\.com| video\.mantlepro\.com| video\.deadsuperhero\.com| peertube\.musicstudio\.pro| peertube\.we-keys\.fr| artitube\.artifaille\.fr| peertube\.ethernia\.net| tube\.midov\.pl| peertube\.fr| watch\.snoot\.tube| peertube\.donnadieu\.fr| argos\.aquilenet\.fr| tube\.nemsia\.org| tube\.bruniau\.net| videos\.darckoune\.moe| tube\.traydent\.info| dev\.videos\.lecygnenoir\.info| peertube\.nayya\.org| peertube\.live| peertube\.mofgao\.space| video\.lequerrec\.eu| peertube\.amicale\.net| aperi\.tube| tube\.ac-lyon\.fr| video\.lw1\.at| www\.yiny\.org| videos\.pofilo\.fr| tube\.lou\.lt| choob\.h\.etbus\.ch| tube\.hoga\.fr| peertube\.heberge\.fr| video\.obermui\.de| videos\.cloudfrancois\.fr| betamax\.video| video\.typica\.us| tube\.piweb\.be| video\.blender\.org| peertube\.cat| 
tube\.kdy\.ch| pe\.ertu\.be| peertube\.social| videos\.lescommuns\.org| tv\.datamol\.org| videonaute\.fr| dialup\.express| peertube\.nogafa\.org| megatube\.lilomoino\.fr| peertube\.tamanoir\.foucry\.net| peertube\.devosi\.org| peertube\.1312\.media| tube\.bootlicker\.party| skeptikon\.fr| video\.blueline\.mg| tube\.homecomputing\.fr| tube\.ouahpiti\.info| video\.tedomum\.net| video\.g3l\.org| fontube\.fr| peertube\.gaialabs\.ch| tube\.kher\.nl| peertube\.qtg\.fr| video\.migennes\.net| tube\.p2p\.legal| troll\.tv| videos\.iut-orsay\.fr| peertube\.solidev\.net| videos\.cemea\.org| video\.passageenseine\.fr| videos\.festivalparminous\.org| peertube\.touhoppai\.moe| sikke\.fi| peer\.hostux\.social| share\.tube| peertube\.walkingmountains\.fr| videos\.benpro\.fr| peertube\.parleur\.net| peertube\.heraut\.eu| tube\.aquilenet\.fr| peertube\.gegeweb\.eu| framatube\.org| thinkerview\.video| tube\.conferences-gesticulees\.net| peertube\.datagueule\.tv| video\.lqdn\.fr| tube\.mochi\.academy| media\.zat\.im| video\.colibris-outilslibres\.org| tube\.svnet\.fr| peertube\.video| peertube3\.cpy\.re| peertube2\.cpy\.re| videos\.tcit\.fr| peertube\.cpy\.re| canard\.tube )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _API_BASE = 'https://%s/api/v1/videos/%s/%s' _VALID_URL = r'''(?x) (?: peertube:(?P[^:]+):| https?://(?P%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/ ) (?P%s) ''' % (_INSTANCES_RE, _UUID_RE) _TESTS = [{ 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', 'md5': '9bed8c0137913e17b86334e5885aacff', 'info_dict': { 'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d', 'ext': 'mp4', 'title': 'What is PeerTube?', 'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10', 'thumbnail': r're:https?://.*\.(?:jpg|png)', 'timestamp': 1538391166, 'upload_date': '20181001', 'uploader': 'Framasoft', 'uploader_id': '3', 'uploader_url': 'https://framatube.org/accounts/framasoft', 'channel': 'Les vidéos de Framasoft', 
'channel_id': '2', 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', 'language': 'en', 'license': 'Attribution - Share Alike', 'duration': 113, 'view_count': int, 'like_count': int, 'dislike_count': int, 'tags': ['framasoft', 'peertube'], 'categories': ['Science & Technology'], } }, { # Issue #26002 'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc', 'info_dict': { 'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc', 'ext': 'mp4', 'title': 'Dot matrix printer shell demo', 'uploader_id': '3', 'timestamp': 1587401293, 'upload_date': '20200420', 'uploader': 'Drew DeVault', } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', 'only_matching': True, }, { # nsfw 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', 'only_matching': True, }, { 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', 'only_matching': True, }, { 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', 'only_matching': True, }, { 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', 'only_matching': True, }] @staticmethod def _extract_peertube_url(webpage, source_url): mobj = re.match( r'https?://(?P[^/]+)/videos/(?:watch|embed)/(?P%s)' % PeerTubeIE._UUID_RE, source_url) if mobj and any(p in webpage for p in ( 'PeerTube<', 'There will be other non JS-based clients to access PeerTube', '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): return 'peertube:%s:%s' % mobj.group('host', 'id') @staticmethod def _extract_urls(webpage, source_url): entries = re.findall( r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) if not entries: peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) if peertube_url: entries = [peertube_url] return entries def 
_call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): return self._download_json( self._API_BASE % (host, video_id, path), video_id, note=note, errnote=errnote, fatal=fatal) def _get_subtitles(self, host, video_id): captions = self._call_api( host, video_id, 'captions', note='Downloading captions JSON', fatal=False) if not isinstance(captions, dict): return data = captions.get('data') if not isinstance(data, list): return subtitles = {} for e in data: language_id = try_get(e, lambda x: x['language']['id'], compat_str) caption_url = urljoin('https://%s' % host, e.get('captionPath')) if not caption_url: continue subtitles.setdefault(language_id or 'en', []).append({ 'url': caption_url, }) return subtitles def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or mobj.group('host_2') video_id = mobj.group('id') video = self._call_api( host, video_id, '', note='Downloading video JSON') title = video['name'] formats = [] files = video.get('files') or [] for playlist in (video.get('streamingPlaylists') or []): if not isinstance(playlist, dict): continue playlist_files = playlist.get('files') if not (playlist_files and isinstance(playlist_files, list)): continue files.extend(playlist_files) for file_ in files: if not isinstance(file_, dict): continue file_url = url_or_none(file_.get('fileUrl')) if not file_url: continue file_size = int_or_none(file_.get('size')) format_id = try_get( file_, lambda x: x['resolution']['label'], compat_str) f = parse_resolution(format_id) f.update({ 'url': file_url, 'format_id': format_id, 'filesize': file_size, }) if format_id == '0p': f['vcodec'] = 'none' else: f['fps'] = int_or_none(file_.get('fps')) formats.append(f) self._sort_formats(formats) description = video.get('description') if len(description) >= 250: # description is shortened full_description = self._call_api( host, video_id, 'description', note='Downloading description JSON', fatal=False) if 
isinstance(full_description, dict): description = str_or_none(full_description.get('description')) or description subtitles = self.extract_subtitles(host, video_id) def data(section, field, type_): return try_get(video, lambda x: x[section][field], type_) def account_data(field, type_): return data('account', field, type_) def channel_data(field, type_): return data('channel', field, type_) category = data('category', 'label', compat_str) categories = [category] if category else None nsfw = video.get('nsfw') if nsfw is bool: age_limit = 18 if nsfw else 0 else: age_limit = None webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), 'uploader_id': str_or_none(account_data('id', int)), 'uploader_url': url_or_none(account_data('url', compat_str)), 'channel': channel_data('displayName', compat_str), 'channel_id': str_or_none(channel_data('id', int)), 'channel_url': url_or_none(channel_data('url', compat_str)), 'language': data('language', 'id', compat_str), 'license': data('licence', 'label', compat_str), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likes')), 'dislike_count': int_or_none(video.get('dislikes')), 'age_limit': age_limit, 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, 'subtitles': subtitles, 'webpage_url': webpage_url, } ================================================ FILE: youtube_dl/extractor/people.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class PeopleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?people\.com/people/videos/0,,(?P<id>\d+),00\.html' _TEST = { 'url': 
'http://www.people.com/people/videos/0,,20995451,00.html', 'info_dict': { 'id': 'ref:20995451', 'ext': 'mp4', 'title': 'Astronaut Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”', 'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 246.318, 'timestamp': 1458720585, 'upload_date': '20160323', 'uploader_id': '416418724', }, 'params': { 'skip_download': True, }, 'add_ie': ['BrightcoveNew'], } def _real_extract(self, url): return self.url_result( 'http://players.brightcove.net/416418724/default_default/index.html?videoId=ref:%s' % self._match_id(url), 'BrightcoveNew') ================================================ FILE: youtube_dl/extractor/performgroup.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import int_or_none class PerformGroupIE(InfoExtractor): _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})' _TESTS = [{ # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html 'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab', 'md5': '259cb03d142e2e52471e8837ecacb29f', 'info_dict': { 'id': 'xgrwobuzumes1lwjxtcdpwgxd', 'ext': 'mp4', 'title': 'Liga MX: Keine Einsicht nach Horrorfoul', 'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b', 'timestamp': 1511533477, 'upload_date': '20171124', } }] def _call_api(self, service, auth_token, content_id, referer_url): return self._download_json( 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id), content_id, headers={ 'Referer': referer_url, 'Origin': 'http://player.performgroup.com', }, query={ '_fmt': 'json', }) def _real_extract(self, url): player_id, auth_token = 
    def _real_extract(self, url):
        # The URL fragment carries both the player (bootstrap) id and the auth token.
        player_id, auth_token = re.search(self._VALID_URL, url).groups()
        # The bootstrap call resolves the player id to the actual video entry.
        bootstrap = self._call_api('bootstrap', auth_token, player_id, url)
        video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0]
        video_id = video['uuid']
        # A second call fetches the VOD media descriptors for that video.
        vod = self._call_api('vod', auth_token, video_id, url)
        media = vod['videos']['video'][0]['media']

        formats = []
        hls_url = media.get('hls', {}).get('url')
        if hls_url:
            formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
        hds_url = media.get('hds', {}).get('url')
        if hds_url:
            # the '?hdcore' suffix is appended to the HDS manifest URL here;
            # presumably required by the CDN — behavior kept as-is
            formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False))

        # Progressive HTTP renditions, one entry per bitrate.
        for c in media.get('content', []):
            c_url = c.get('url')
            if not c_url:
                continue
            # scale 1000: raw values appear to be in bps/ms — kept as upstream
            tbr = int_or_none(c.get('bitrate'), 1000)
            format_id = 'http'
            if tbr:
                format_id += '-%d' % tbr
            formats.append({
                'format_id': format_id,
                'url': c_url,
                'tbr': tbr,
                'width': int_or_none(c.get('width')),
                'height': int_or_none(c.get('height')),
                'filesize': int_or_none(c.get('fileSize')),
                'vcodec': c.get('type'),
                'fps': int_or_none(c.get('videoFrameRate')),
                'vbr': int_or_none(c.get('videoRate'), 1000),
                'abr': int_or_none(c.get('audioRate'), 1000),
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video['title'],
            'description': video.get('description'),
            'thumbnail': video.get('poster'),
            'duration': int_or_none(video.get('duration')),
            'timestamp': int_or_none(video.get('publishedTime'), 1000),
            'formats': formats,
        }
_parse_broadcast_data(self, broadcast, video_id): title = broadcast.get('status') or 'Periscope Broadcast' uploader = broadcast.get('user_display_name') or broadcast.get('username') title = '%s - %s' % (uploader, title) if uploader else title is_live = broadcast.get('state').lower() == 'running' thumbnails = [{ 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] return { 'id': broadcast.get('id') or video_id, 'title': self._live_title(title) if is_live else title, 'timestamp': parse_iso8601(broadcast.get('created_at')), 'uploader': uploader, 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), 'thumbnails': thumbnails, 'view_count': int_or_none(broadcast.get('total_watched')), 'tags': broadcast.get('tags'), 'is_live': is_live, } @staticmethod def _extract_common_format_info(broadcast): return broadcast.get('state').lower(), int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height')) @staticmethod def _add_width_and_height(f, width, height): for key, val in (('width', width), ('height', height)): if not f.get(key): f[key] = val def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True): m3u8_formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native' if state in ('ended', 'timed_out') else 'm3u8', m3u8_id=format_id, fatal=fatal, headers=self._M3U8_HEADERS) if len(m3u8_formats) == 1: self._add_width_and_height(m3u8_formats[0], width, height) for f in m3u8_formats: f.setdefault('http_headers', {}).update(self._M3U8_HEADERS) return m3u8_formats class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here https://www.periscope.tv/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': 
'65b57957972e503fcbbaeed8f4fa04ca', 'info_dict': { 'id': '56102209', 'ext': 'mp4', 'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗', 'timestamp': 1438978559, 'upload_date': '20150807', 'uploader': 'Bec Boop', 'uploader_id': '1465763', }, 'skip': 'Expires in 24 hours', }, { 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', 'only_matching': True, }, { 'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX', 'only_matching': True, }, { 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', 'only_matching': True, }] @staticmethod def _extract_url(webpage): mobj = re.search( r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage) if mobj: return mobj.group('url') def _real_extract(self, url): token = self._match_id(url) stream = self._call_api( 'accessVideoPublic', {'broadcast_id': token}, token) broadcast = stream['broadcast'] info = self._parse_broadcast_data(broadcast, token) state = broadcast.get('state').lower() width = int_or_none(broadcast.get('width')) height = int_or_none(broadcast.get('height')) def add_width_and_height(f): for key, val in (('width', width), ('height', height)): if not f.get(key): f[key] = val video_urls = set() formats = [] for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): video_url = stream.get(format_id + '_url') if not video_url or video_url in video_urls: continue video_urls.add(video_url) if format_id != 'rtmp': m3u8_formats = self._extract_pscp_m3u8_formats( video_url, token, format_id, state, width, height, False) formats.extend(m3u8_formats) continue rtmp_format = { 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', } self._add_width_and_height(rtmp_format) formats.append(rtmp_format) self._sort_formats(formats) info['formats'] = formats return info class PeriscopeUserIE(PeriscopeBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$' IE_DESC = 'Periscope 
user videos' IE_NAME = 'periscope:user' _TEST = { 'url': 'https://www.periscope.tv/LularoeHusbandMike/', 'info_dict': { 'id': 'LularoeHusbandMike', 'title': 'LULAROE HUSBAND MIKE', 'description': 'md5:6cf4ec8047768098da58e446e82c82f0', }, # Periscope only shows videos in the last 24 hours, so it's possible to # get 0 videos 'playlist_mincount': 0, } def _real_extract(self, url): user_name = self._match_id(url) webpage = self._download_webpage(url, user_name) data_store = self._parse_json( unescapeHTML(self._search_regex( r'data-store=(["\'])(?P<data>.+?)\1', webpage, 'data store', default='{}', group='data')), user_name) user = list(data_store['UserCache']['users'].values())[0]['user'] user_id = user['id'] session_id = data_store['SessionToken']['public']['broadcastHistory']['token']['session_id'] broadcasts = self._call_api( 'getUserBroadcastsPublic', {'user_id': user_id, 'session_id': session_id}, user_name)['broadcasts'] broadcast_ids = [ broadcast['id'] for broadcast in broadcasts if broadcast.get('id')] title = user.get('display_name') or user.get('username') or user_name description = user.get('description') entries = [ self.url_result( 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) for broadcast_id in broadcast_ids] return self.playlist_result(entries, user_id, title, description) ================================================ FILE: youtube_dl/extractor/philharmoniedeparis.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( try_get, urljoin, ) class PhilharmonieDeParisIE(InfoExtractor): IE_DESC = 'Philharmonie de Paris' _VALID_URL = r'''(?x) https?:// (?: live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|embed(?:app)?/|misc/Playlist\.ashx\?id=)| pad\.philharmoniedeparis\.fr/doc/CIMU/ ) (?P<id>\d+) ''' _TESTS = [{ 'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', 'md5': 
'a0a4b195f544645073631cbec166a2c2', 'info_dict': { 'id': '1086697', 'ext': 'mp4', 'title': 'Jazz à la Villette : Knower', }, }, { 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', 'info_dict': { 'id': '1032066', 'title': 'md5:0a031b81807b3593cffa3c9a87a167a0', }, 'playlist_mincount': 2, }, { 'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html', 'only_matching': True, }, { 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr', 'only_matching': True, }, { 'url': 'https://live.philharmoniedeparis.fr/embedapp/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', 'only_matching': True, }, { 'url': 'https://live.philharmoniedeparis.fr/embed/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', 'only_matching': True, }] _LIVE_URL = 'https://live.philharmoniedeparis.fr' def _real_extract(self, url): video_id = self._match_id(url) config = self._download_json( '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={ 'id': video_id, 'lang': 'fr-FR', }) def extract_entry(source): if not isinstance(source, dict): return title = source.get('title') if not title: return files = source.get('files') if not isinstance(files, dict): return format_urls = set() formats = [] for format_id in ('mobile', 'desktop'): format_url = try_get( files, lambda x: x[format_id]['file'], compat_str) if not format_url or format_url in format_urls: continue format_urls.add(format_url) m3u8_url = urljoin(self._LIVE_URL, format_url) formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) if not formats: return self._sort_formats(formats) return { 'title': title, 'formats': formats, } thumbnail = urljoin(self._LIVE_URL, config.get('image')) info = extract_entry(config) if info: info.update({ 'id': video_id, 'thumbnail': thumbnail, }) return info entries = [] for num, chapter in 
enumerate(config['chapters'], start=1): entry = extract_entry(chapter) entry['id'] = '%s-%d' % (video_id, num) entries.append(entry) return self.playlist_result(entries, video_id, config.get('title')) ================================================ FILE: youtube_dl/extractor/phoenix.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .youtube import YoutubeIE from .zdf import ZDFBaseIE from ..compat import compat_str from ..utils import ( int_or_none, merge_dicts, try_get, unified_timestamp, urljoin, ) class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P<id>\d+)\.html' _TESTS = [{ # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', 'md5': '34ec321e7eb34231fd88616c65c92db0', 'info_dict': { 'id': '210222_phx_nachgehakt_corona_protest', 'ext': 'mp4', 'title': 'Wohin führt der Protest in der Pandemie?', 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', 'duration': 1691, 'timestamp': 1613902500, 'upload_date': '20210221', 'uploader': 'Phoenix', 'series': 'corona nachgehakt', 'episode': 'Wohin führt der Protest in der Pandemie?', }, }, { # Youtube embed 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', 'info_dict': { 'id': 'hMQtqFYjomk', 'ext': 'mp4', 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', 'duration': 3509, 'upload_date': '20201219', 'uploader': 'phoenix', 'uploader_id': 'phoenix', }, 'params': { 'skip_download': True, }, }, { 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', 'only_matching': True, }, { # no media 'url': 
    def _real_extract(self, url):
        article_id = self._match_id(url)

        # Article JSON; the first paragraph ('absaetze') holds the video reference.
        article = self._download_json(
            'https://www.phoenix.de/response/id/%s' % article_id, article_id,
            'Downloading article JSON')

        video = article['absaetze'][0]
        title = video.get('titel') or article.get('subtitel')

        if video.get('typ') == 'video-youtube':
            # Some articles embed a YouTube video; delegate to the YouTube extractor.
            video_id = video['id']
            return self.url_result(
                video_id, ie=YoutubeIE.ie_key(), video_id=video_id,
                video_title=title)

        video_id = compat_str(video.get('basename') or video.get('content'))

        # Details endpoint yields the tracking/nielsen metadata and the asset id.
        details = self._download_json(
            'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php',
            video_id, 'Downloading details JSON', query={
                'ak': 'web',
                'ptmd': 'true',
                'id': video_id,
                'profile': 'player2',
            })

        title = title or details['title']
        content_id = details['tracking']['nielsen']['content']['assetid']

        # Formats/subtitles come from the shared ZDF PTMD endpoint (ZDFBaseIE helper).
        info = self._extract_ptmd(
            'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id,
            content_id, None, url)

        duration = int_or_none(try_get(
            details, lambda x: x['tracking']['nielsen']['content']['length']))
        timestamp = unified_timestamp(details.get('editorialDate'))
        series = try_get(
            details, lambda x: x['tracking']['nielsen']['content']['program'],
            compat_str)
        episode = title if details.get('contentType') == 'episode' else None

        thumbnails = []
        teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {}
        for thumbnail_key, thumbnail_url in teaser_images.items():
            thumbnail_url = urljoin(url, thumbnail_url)
            if not thumbnail_url:
                continue
            thumbnail = {
                'url': thumbnail_url,
            }
            # Layout keys like '640x360' encode the thumbnail dimensions.
            m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
            if m:
                thumbnail['width'] = int(m.group(1))
                thumbnail['height'] = int(m.group(2))
            thumbnails.append(thumbnail)

        return merge_dicts(info, {
            'id': content_id,
            'title': title,
            'description': details.get('leadParagraph'),
            'duration': duration,
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'uploader': details.get('tvService'),
            'series': series,
            'episode': episode,
        })
Try BacklinkMyDomain.com!', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') video_extension = mobj.group('ext') webpage = self._download_webpage(url, video_id) # Extract URL, uploader, and title from webpage self.report_extraction(video_id) info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', webpage, 'info json') info = json.loads(info_json) url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) return { 'id': video_id, 'url': url, 'uploader': info['username'], 'timestamp': info['creationDate'], 'title': info['title'], 'ext': video_extension, 'thumbnail': info['thumbUrl'], } ================================================ FILE: youtube_dl/extractor/picarto.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, js_to_json, ) class PicartoIE(InfoExtractor): _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { 'id': 'Setz', 'ext': 'mp4', 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'timestamp': int, 'is_live': True }, 'skip': 'Stream is offline', } @classmethod def suitable(cls, url): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): channel_id = self._match_id(url) data = self._download_json( 'https://ptvintern.picarto.tv/ptvapi', channel_id, query={ 'query': '''{ channel(name: "%s") { adult id online stream_name title } getLoadBalancerUrl(channel_name: "%s") { url } }''' % (channel_id, channel_id), })['data'] metadata = data['channel'] if metadata.get('online') == 0: raise ExtractorError('Stream is offline', expected=True) title = metadata['title'] cdn_data = self._download_json( data['getLoadBalancerUrl']['url'] + '/stream/json_' + 
metadata['stream_name'] + '.js', channel_id, 'Downloading load balancing info') formats = [] for source in (cdn_data.get('source') or []): source_url = source.get('url') if not source_url: continue source_type = source.get('type') if source_type == 'html5/application/vnd.apple.mpegurl': formats.extend(self._extract_m3u8_formats( source_url, channel_id, 'mp4', m3u8_id='hls', fatal=False)) elif source_type == 'html5/video/mp4': formats.append({ 'url': source_url, }) self._sort_formats(formats) mature = metadata.get('adult') if mature is None: age_limit = None else: age_limit = 18 if mature is True else 0 return { 'id': channel_id, 'title': self._live_title(title.strip()), 'is_live': True, 'channel': channel_id, 'channel_id': metadata.get('id'), 'channel_url': 'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, } class PicartoVodIE(InfoExtractor): _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', 'info_dict': { 'id': 'ArtofZod_2017.12.12.00.13.23.flv', 'ext': 'mp4', 'title': 'ArtofZod_2017.12.12.00.13.23.flv', 'thumbnail': r're:^https?://.*\.jpg' }, }, { 'url': 'https://picarto.tv/videopopout/Plague', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) vod_info = self._parse_json( self._search_regex( r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, video_id), video_id, transform_source=js_to_json) formats = self._extract_m3u8_formats( vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') self._sort_formats(formats) return { 'id': video_id, 'title': video_id, 'thumbnail': vod_info.get('vodThumb'), 'formats': formats, } ================================================ FILE: youtube_dl/extractor/piksel.py ================================================ # coding: utf-8 from 
__future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( dict_get, ExtractorError, int_or_none, parse_iso8601, try_get, unescapeHTML, ) class PikselIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: (?: player\. (?: olympusattelecom| vibebyvista )| (?:api|player)\.multicastmedia| (?:api-ovp|player)\.piksel )\.com| (?: mz-edge\.stream\.co| movie-s\.nhk\.or )\.jp| vidego\.baltimorecity\.gov )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', 'md5': '34e34c8d89dc2559976a6079db531e85', 'info_dict': { 'id': 'ums2867l', 'ext': 'mp4', 'title': 'GX-005 with Caption', 'timestamp': 1481335659, 'upload_date': '20161210' } }, { # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al 'url': 'https://player.piksel.com/v/v80kqp41', 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', 'info_dict': { 'id': 'v80kqp41', 'ext': 'mp4', 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. 
    @staticmethod
    def _extract_url(webpage):
        # Find an embedded Piksel player iframe and return its URL, if any.
        mobj = re.search(
            r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)',
            webpage)
        if mobj:
            return mobj.group('url')

    def _call_api(self, app_token, resource, display_id, query, fatal=True):
        # Piksel web-service call; the payload of interest sits under 'response'.
        response = (self._download_json(
            'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token),
            display_id, query=query, fatal=fatal) or {}).get('response')
        # API-level failures come back inside the JSON body, not as HTTP errors.
        failure = try_get(response, lambda x: x['failure']['reason'])
        if failure:
            if fatal:
                raise ExtractorError(failure, expected=True)
            self.report_warning(failure)
        return response

    def _real_extract(self, url):
        ref_id, display_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, display_id)
        app_token = self._search_regex([
            r'clientAPI\s*:\s*"([^"]+)"',
            r'data-de-api-key\s*=\s*"([^"]+)"'
        ], webpage, 'app token')
        # Lookup is by reference id ('refid'/'prefid') or by video id ('v').
        query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id}
        program = self._call_api(
            app_token, 'program', display_id, query)['WsProgramResponse']['program']
        video_id = program['uuid']
        video_data = program['asset']
        title = video_data['title']
        asset_type = dict_get(video_data, ['assetType', 'asset_type'])

        formats = []

        def process_asset_file(asset_file):
            # Appends an HTTP progressive format to `formats` (closure).
            if not asset_file:
                return
            # TODO: extract rtmp formats
            http_url = asset_file.get('http_url')
            if not http_url:
                return
            tbr = None
            vbr = int_or_none(asset_file.get('videoBitrate'), 1024)
            abr = int_or_none(asset_file.get('audioBitrate'), 1024)
            # NOTE(review): for 'video' assets this assumes both bitrates are
            # present; a missing one would raise TypeError here — confirm.
            if asset_type == 'video':
                tbr = vbr + abr
            elif asset_type == 'audio':
                tbr = abr

            format_id = ['http']
            if tbr:
                format_id.append(compat_str(tbr))

            formats.append({
                'format_id': '-'.join(format_id),
                'url': unescapeHTML(http_url),
                'vbr': vbr,
                'abr': abr,
                'width': int_or_none(asset_file.get('videoWidth')),
                'height': int_or_none(asset_file.get('videoHeight')),
                'filesize': int_or_none(asset_file.get('filesize')),
                'tbr': tbr,
            })

        def process_asset_files(asset_files):
            for asset_file in (asset_files or []):
                process_asset_file(asset_file)

        process_asset_files(video_data.get('assetFiles'))
        process_asset_file(video_data.get('referenceFile'))
        if not formats:
            # Fall back to a dedicated asset_file lookup when the program
            # payload carried no usable files.
            asset_id = video_data.get('assetid') or program.get('assetid')
            if asset_id:
                process_asset_files(try_get(self._call_api(
                    app_token, 'asset_file', display_id, {
                        'assetid': asset_id,
                    }, False),
                    lambda x: x['WsAssetFileResponse']['AssetFiles']))

        # HLS variants, whichever per-device URL key is populated.
        m3u8_url = dict_get(video_data, [
            'm3u8iPadURL',
            'ipadM3u8Url',
            'm3u8AndroidURL',
            'm3u8iPhoneURL',
            'iphoneM3u8Url'])
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=False))

        smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil'])
        if smil_url:
            transform_source = None
            if ref_id == 'nhkworld':
                # TODO: figure out if this is something to be fixed in urljoin,
                # _parse_smil_formats or keep it here
                transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"')
            formats.extend(self._extract_smil_formats(
                re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id,
                transform_source=transform_source, fatal=False))

        self._sort_formats(formats)

        subtitles = {}
        for caption in video_data.get('captions', []):
            caption_url = caption.get('url')
            if caption_url:
                subtitles.setdefault(caption.get('locale', 'en'), []).append({
                    'url': caption_url})

        return {
            'id': video_id,
            'title': title,
            'description': video_data.get('description'),
            'thumbnail': video_data.get('thumbnailUrl'),
            'timestamp': parse_iso8601(video_data.get('dateadd')),
            'formats': formats,
            'subtitles': subtitles,
        }
import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, remove_end, remove_start, str_to_int, unified_strdate, ) class PinkbikeIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.pinkbike.com/video/402811/', 'md5': '4814b8ca7651034cd87e3361d5c2155a', 'info_dict': { 'id': '402811', 'ext': 'mp4', 'title': 'Brandon Semenuk - RAW 100', 'description': 'Official release: www.redbull.ca/rupertwalker', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 100, 'upload_date': '20150406', 'uploader': 'revelco', 'location': 'Victoria, British Columbia, Canada', 'view_count': int, 'comment_count': int, } }, { 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://www.pinkbike.com/video/%s' % video_id, video_id) formats = [] for _, format_id, src in re.findall( r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage): height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) formats.append({ 'url': src, 'format_id': format_id, 'height': height, }) self._sort_formats(formats) title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike') description = self._html_search_regex( r'(?s)id="media-description"[^>]*>(.+?)<', webpage, 'description', default=None) or remove_start( self._og_search_description(webpage), title + '. 
') thumbnail = self._og_search_thumbnail(webpage) duration = int_or_none(self._html_search_meta( 'video:duration', webpage, 'duration')) uploader = self._search_regex( r'<a[^>]+\brel=["\']author[^>]+>([^<]+)', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._search_regex( r'class="fullTime"[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)) location = self._html_search_regex( r'(?s)<dt>Location</dt>\s*<dd>(.+?)<', webpage, 'location', fatal=False) def extract_count(webpage, label): return str_to_int(self._search_regex( r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label, webpage, label, fatal=False)) view_count = extract_count(webpage, 'Views') comment_count = extract_count(webpage, 'Comments') return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'upload_date': upload_date, 'uploader': uploader, 'location': location, 'view_count': view_count, 'comment_count': comment_count, 'formats': formats } ================================================ FILE: youtube_dl/extractor/pinterest.py ================================================ # coding: utf-8 from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, try_get, unified_timestamp, url_or_none, ) class PinterestBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' def _call_api(self, resource, video_id, options): return self._download_json( 'https://www.pinterest.com/resource/%sResource/get/' % resource, video_id, 'Download %s JSON metadata' % resource, query={ 'data': json.dumps({'options': options}) })['resource_response'] def 
    def _extract_video(self, data, extract_formats=True):
        """Build an info dict from a pin's JSON ``data``.

        When extract_formats is False only metadata is returned (used for
        feed items whose formats would be resolved via the pin URL).
        """
        video_id = data['id']

        title = (data.get('title') or data.get('grid_title') or video_id).strip()

        urls = []
        formats = []
        duration = None
        if extract_formats:
            for format_id, format_dict in data['videos']['video_list'].items():
                if not isinstance(format_dict, dict):
                    continue
                format_url = url_or_none(format_dict.get('url'))
                # Skip invalid URLs and de-duplicate across format entries.
                if not format_url or format_url in urls:
                    continue
                urls.append(format_url)
                # Duration is given in milliseconds.
                duration = float_or_none(format_dict.get('duration'), scale=1000)
                ext = determine_ext(format_url)
                if 'hls' in format_id.lower() or ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id=format_id, fatal=False))
                else:
                    formats.append({
                        'url': format_url,
                        'format_id': format_id,
                        'width': int_or_none(format_dict.get('width')),
                        'height': int_or_none(format_dict.get('height')),
                        'duration': duration,
                    })
            self._sort_formats(
                formats, field_preference=('height', 'width', 'tbr', 'format_id'))

        description = data.get('description') or data.get('description_html') or data.get('seo_description')
        timestamp = unified_timestamp(data.get('created_at'))

        def _u(field):
            # Uploader info lives under closeup_attribution.
            return try_get(data, lambda x: x['closeup_attribution'][field], compat_str)

        uploader = _u('full_name')
        uploader_id = _u('id')

        repost_count = int_or_none(data.get('repin_count'))
        comment_count = int_or_none(data.get('comment_count'))
        categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list)
        tags = data.get('hashtags')

        thumbnails = []
        images = data.get('images')
        if isinstance(images, dict):
            for thumbnail_id, thumbnail in images.items():
                if not isinstance(thumbnail, dict):
                    continue
                thumbnail_url = url_or_none(thumbnail.get('url'))
                if not thumbnail_url:
                    continue
                thumbnails.append({
                    'url': thumbnail_url,
                    'width': int_or_none(thumbnail.get('width')),
                    'height': int_or_none(thumbnail.get('height')),
                })

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'timestamp': timestamp,
            'thumbnails': thumbnails,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'repost_count': repost_count,
            'comment_count': comment_count,
            'categories': categories,
            'tags': tags,
            'formats': formats,
            # Feed items are extracted here but attributed to PinterestIE.
            'extractor_key': PinterestIE.ie_key(),
        }
bookmark = None entries = [] while True: if bookmark: options['bookmarks'] = [bookmark] board_feed = self._call_api('BoardFeed', board_id, options) for item in (board_feed.get('data') or []): if not isinstance(item, dict) or item.get('type') != 'pin': continue video_id = item.get('id') if video_id: # Some pins may not be available anonymously via pin URL # video = self._extract_video(item, extract_formats=False) # video.update({ # '_type': 'url_transparent', # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, # }) # entries.append(video) entries.append(self._extract_video(item)) bookmark = board_feed.get('bookmark') if not bookmark: break return self.playlist_result( entries, playlist_id=board_id, playlist_title=board.get('name')) ================================================ FILE: youtube_dl/extractor/pladform.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( determine_ext, ExtractorError, int_or_none, xpath_text, qualities, ) class PladformIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: (?: out\.pladform\.ru/player| static\.pladform\.ru/player\.swf ) \?.*\bvideoid=| video\.pladform\.ru/catalog/video/videoid/ ) (?P<id>\d+) ''' _TESTS = [{ 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', 'md5': '53362fac3a27352da20fa2803cc5cd6f', 'info_dict': { 'id': '3777899', 'ext': 'mp4', 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 3190, }, }, { 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', 'only_matching': True, }, { 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0', 'only_matching': True, }] @staticmethod def _extract_url(webpage): mobj = 
    def _real_extract(self, url):
        """Extract a Pladform video via the out.pladform.ru XML endpoint."""
        video_id = self._match_id(url)

        # Preserve the player id from the original URL; default to '1'.
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        pl = qs.get('pl', ['1'])[0]

        video = self._download_xml(
            'http://out.pladform.ru/getVideo', video_id, query={
                'pl': pl,
                'videoid': video_id,
            })

        def fail(text):
            # Helper for the two API error shapes below.
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, text),
                expected=True)

        if video.tag == 'error':
            fail(video.text)

        quality = qualities(('ld', 'sd', 'hd'))

        formats = []
        for src in video.findall('./src'):
            if src is None:
                continue
            format_url = src.text
            if not format_url:
                continue
            if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
            else:
                formats.append({
                    'url': src.text,
                    'format_id': src.get('quality'),
                    'quality': quality(src.get('quality')),
                })

        # A <cap> element may explain why no sources were returned.
        if not formats:
            error = xpath_text(video, './cap', 'error', default=None)
            if error:
                fail(error)

        self._sort_formats(formats)

        # The catalog page supplies richer metadata than the XML.
        webpage = self._download_webpage(
            'http://video.pladform.ru/catalog/video/videoid/%s' % video_id,
            video_id)

        title = self._og_search_title(webpage, fatal=False) or xpath_text(
            video, './/title', 'title', fatal=True)
        description = self._search_regex(
            r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False)
        thumbnail = self._og_search_thumbnail(webpage) or xpath_text(
            video, './/cover', 'cover')

        duration = int_or_none(xpath_text(video, './/time', 'duration'))
        age_limit = int_or_none(xpath_text(video, './/age18', 'age limit'))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'age_limit': age_limit,
            'formats': formats,
        }
from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_str, ) from ..utils import ( clean_html, ExtractorError, int_or_none, str_or_none, try_get, url_or_none, urlencode_postdata, urljoin, ) class PlatziBaseIE(InfoExtractor): _LOGIN_URL = 'https://platzi.com/login/' _NETRC_MACHINE = 'platzi' def _real_initialize(self): self._login() def _login(self): username, password = self._get_login_info() if username is None: return login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') login_form = self._hidden_inputs(login_page) login_form.update({ 'email': username, 'password': password, }) urlh = self._request_webpage( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Referer': self._LOGIN_URL}) # login succeeded if 'platzi.com/login' not in urlh.geturl(): return login_error = self._webpage_read_content( urlh, self._LOGIN_URL, None, 'Downloading login error page') login = self._parse_json( self._search_regex( r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'), None) for kind in ('error', 'password', 'nonFields'): error = str_or_none(login.get('%sError' % kind)) if error: raise ExtractorError( 'Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') class PlatziIE(PlatziBaseIE): _VALID_URL = r'''(?x) https?:// (?: platzi\.com/clases| # es version courses\.platzi\.com/classes # en version )/[^/]+/(?P<id>\d+)-[^/?\#&]+ ''' _TESTS = [{ 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', 'md5': '8f56448241005b561c10f11a595b37e3', 'info_dict': { 'id': '12074', 'ext': 'mp4', 'title': 'Creando nuestra primera página', 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', 'duration': 420, }, 'skip': 'Requires platzi account credentials', }, { 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', 'info_dict': { 'id': '13430', 'ext': 
    def _real_extract(self, url):
        """Extract one Platzi lecture (HLS + DASH formats)."""
        lecture_id = self._match_id(url)

        webpage = self._download_webpage(url, lecture_id)

        data = self._parse_json(
            self._search_regex(
                # client_data may contain "};" so that we have to try more
                # strict regex first
                (r'client_data\s*=\s*({.+?})\s*;\s*\n',
                 r'client_data\s*=\s*({.+?})\s*;'),
                webpage, 'client data'),
            lecture_id)

        material = data['initialState']['material']
        desc = material['description']
        title = desc['title']

        formats = []
        # Each server entry may expose both an HLS and a DASH manifest.
        for server_id, server in material['videos'].items():
            if not isinstance(server, dict):
                continue
            for format_id in ('hls', 'dash'):
                format_url = url_or_none(server.get(format_id))
                if not format_url:
                    continue
                if format_id == 'hls':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, lecture_id, 'mp4',
                        entry_protocol='m3u8_native', m3u8_id=format_id,
                        note='Downloading %s m3u8 information' % server_id,
                        fatal=False))
                elif format_id == 'dash':
                    formats.extend(self._extract_mpd_formats(
                        format_url, lecture_id, mpd_id=format_id,
                        note='Downloading %s MPD manifest' % server_id,
                        fatal=False))
        self._sort_formats(formats)

        # The description is delivered as base64-encoded HTML.
        content = str_or_none(desc.get('content'))
        description = (clean_html(compat_b64decode(content).decode('utf-8'))
                       if content else None)
        # Duration is given in minutes; convert to seconds.
        duration = int_or_none(material.get('duration'), invscale=60)

        return {
            'id': lecture_id,
            'title': title,
            'description': description,
            'duration': duration,
            'formats': formats,
        }
    def _real_extract(self, url):
        """Extract a Platzi course as a playlist of lecture URLs."""
        course_name = self._match_id(url)

        webpage = self._download_webpage(url, course_name)

        props = self._parse_json(
            self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'),
            course_name)['initialProps']

        entries = []
        # Chapters are 1-indexed for chapter_number.
        for chapter_num, chapter in enumerate(props['concepts'], 1):
            if not isinstance(chapter, dict):
                continue
            materials = chapter.get('materials')
            if not materials or not isinstance(materials, list):
                continue
            chapter_title = chapter.get('title')
            chapter_id = str_or_none(chapter.get('id'))
            for material in materials:
                if not isinstance(material, dict):
                    continue
                # Non-video materials (readings, quizzes, ...) are skipped.
                if material.get('material_type') != 'video':
                    continue
                video_url = urljoin(url, material.get('url'))
                if not video_url:
                    continue
                entries.append({
                    '_type': 'url_transparent',
                    'url': video_url,
                    'title': str_or_none(material.get('name')),
                    'id': str_or_none(material.get('id')),
                    'ie_key': PlatziIE.ie_key(),
                    'chapter': chapter_title,
                    'chapter_number': chapter_num,
                    'chapter_id': chapter_id,
                })

        # NOTE(review): if the course id is missing, compat_str(None)
        # yields the literal string 'None' as playlist id — consider
        # str_or_none here; verify against callers before changing.
        course_id = compat_str(try_get(props, lambda x: x['course']['id']))
        course_title = try_get(props, lambda x: x['course']['name'], compat_str)

        return self.playlist_result(entries, course_id, course_title)
'c505f8307825a245d0c7ad1850001f22', 'info_dict': { 'id': '71276', 'ext': 'mp3', 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', 'description': '', 'duration': 5627, 'timestamp': 1406033781, 'upload_date': '20140722', 'uploader': 'Dan Drastic', 'uploader_id': '71170', 'view_count': int, 'comment_count': int, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') slug = mobj.group('slug') recordings = self._download_json( 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) error = recordings.get('error') if isinstance(error, dict): raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, error.get('message')), expected=True) audio_url = recordings['audio'] video_id = compat_str(recordings.get('id') or video_id) title = recordings['title'] description = recordings.get('description') duration = int_or_none(recordings.get('recordingDuration')) timestamp = parse_iso8601(recordings.get('created_at')) uploader = recordings.get('page', {}).get('title') uploader_id = compat_str(recordings.get('page', {}).get('id')) view_count = int_or_none(recordings.get('playCount')) comment_count = int_or_none(recordings.get('commentCount')) categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] return { 'id': video_id, 'url': audio_url, 'title': title, 'description': description, 'duration': duration, 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, 'view_count': view_count, 'comment_count': comment_count, 'categories': categories, } ================================================ FILE: youtube_dl/extractor/playplustv.py ================================================ # coding: utf-8 from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( clean_html, ExtractorError, int_or_none, PUTRequest, ) class PlayPlusTVIE(InfoExtractor): _VALID_URL = 
    def _real_initialize(self):
        """Log in (mandatory) and cache the bearer token and profile id.

        Raises the standard login-required error when no credentials are
        configured; surfaces the API's errorMessage on HTTP 401.
        """
        email, password = self._get_login_info()
        if email is None:
            self.raise_login_required()

        # The login endpoint expects a JSON body via HTTP PUT.
        req = PUTRequest(
            'https://api.playplus.tv/api/web/login', json.dumps({
                'email': email,
                'password': password,
            }).encode(), {
                'Content-Type': 'application/json; charset=utf-8',
            })

        try:
            self._token = self._download_json(req, None)['token']
        except ExtractorError as e:
            # 401 responses carry a JSON body with a human-readable message.
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                raise ExtractorError(self._parse_json(
                    e.cause.read(), None)['errorMessage'],
                    expected=True)
            raise
        # First profile of the account is used for all media lookups.
        # NOTE(review): this assigns self._profile while the class declares
        # _profile_id = None — the class attribute appears unused; confirm
        # before renaming either.
        self._profile = self._call_api('Profiles')['list'][0]['_id']
    def _real_extract(self, url):
        """Extract a plays.tv video: DASH manifest plus HTTP <source> tags."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'https://plays.tv/video/%s' % video_id, video_id)

        # JSON-LD supplies title/thumbnail and other base metadata.
        info = self._search_json_ld(webpage, video_id,)

        # The <video> tag carries the MPD URL; its body lists HTTP sources.
        mpd_url, sources = re.search(
            r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>',
            webpage).groups()
        formats = self._extract_mpd_formats(
            self._proto_relative_url(mpd_url), video_id, mpd_id='DASH')
        for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources):
            formats.append({
                'url': self._proto_relative_url(format_url),
                'format_id': 'http-' + format_id,
                'height': int_or_none(height),
            })
        self._sort_formats(formats)

        info.update({
            'id': video_id,
            'description': self._og_search_description(webpage),
            # Prefer the JSON-LD thumbnail, falling back to og:image.
            'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage),
            'formats': formats,
        })

        return info
self.playlist_result(entries, video_id) ================================================ FILE: youtube_dl/extractor/playtvak.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_urlparse, compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, int_or_none, parse_iso8601, qualities, ) class PlaytvakIE(InfoExtractor): IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz' _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)' _TESTS = [{ 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', 'md5': '4525ae312c324b4be2f4603cc78ceb4a', 'info_dict': { 'id': 'A150730_150323_hodinovy-manzel_kuko', 'ext': 'mp4', 'title': 'Vyžeňte vosy a sršně ze zahrady', 'description': 'md5:4436e61b7df227a093778efb7e373571', 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', 'duration': 279, 'timestamp': 1438732860, 'upload_date': '20150805', 'is_live': False, } }, { # live video test 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', 'info_dict': { 'id': 'A150624_164934_planespotting_cat', 'ext': 'flv', 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', 'is_live': True, }, 'params': { 'skip_download': True, # requires rtmpdump }, }, { # another live stream, this one without Misc.videoFLV 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', 'info_dict': { 'id': 'A151218_145728_hlavni-nadrazi_plap', 'ext': 'flv', 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { 'skip_download': True, # requires rtmpdump }, }, { # idnes.cz 'url': 
    def _real_extract(self, url):
        """Extract a Playtvak/iDNES/Lidovky/Metro video or live stream."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # The player is initialised via Misc.video(...) / Misc.videoFLV(...)
        # whose "data" argument is the metadata URL.
        info_url = self._html_search_regex(
            r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url')

        parsed_url = compat_urlparse.urlparse(info_url)

        qs = compat_urlparse.parse_qs(parsed_url.query)
        # Disable ads and request plain JS(ON) output.
        qs.update({
            'reklama': ['0'],
            'type': ['js'],
        })

        info_url = compat_urlparse.urlunparse(
            parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

        # The response is JSONP-like; cut out the outermost {...} object.
        json_info = self._download_json(
            info_url, video_id,
            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])

        # Pick the first item that is an on-demand video or a live stream.
        item = None
        for i in json_info['items']:
            if i.get('type') == 'video' or i.get('type') == 'stream':
                item = i
                break
        if not item:
            raise ExtractorError('No suitable stream found')

        quality = qualities(('low', 'middle', 'high'))

        formats = []
        for fmt in item['video']:
            video_url = fmt.get('file')
            if not video_url:
                continue

            format_ = fmt['format']
            format_id = '%s_%s' % (format_, fmt['quality'])
            preference = None

            if format_ in ('mp4', 'webm'):
                ext = format_
            elif format_ == 'rtmp':
                ext = 'flv'
            elif format_ == 'apple':
                ext = 'mp4'
                # Some streams have mp3 audio which does not play
                # well with ffmpeg filter aac_adtstoasc
                preference = -1
            elif format_ == 'adobe':  # f4m manifest fails with 404 in 80% of requests
                continue
            else:  # Other formats not supported yet
                continue

            formats.append({
                'url': video_url,
                'ext': ext,
                'format_id': format_id,
                'quality': quality(fmt.get('quality')),
                'preference': preference,
            })
        self._sort_formats(formats)

        title = item['title']
        is_live = item['type'] == 'stream'
        if is_live:
            title = self._live_title(title)
        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
            'description', webpage, 'description', default=None)
        timestamp = None
        duration = None
        if not is_live:
            duration = int_or_none(item.get('length'))
            timestamp = item.get('published')
            if timestamp:
                # Drop the trailing "+0200"-style offset before parsing.
                timestamp = parse_iso8601(timestamp[:-5])

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': item.get('image'),
            'duration': duration,
            'timestamp': timestamp,
            'is_live': is_live,
            'formats': formats,
        }
compat_urllib_parse_unquote,
    compat_urllib_parse_unquote_plus,
)
from ..utils import (
    clean_html,
    ExtractorError,
)


class PlayvidIE(InfoExtractor):
    # Extractor for playvid.com watch pages (both /watch?v=ID and /watch/ID forms).
    _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
    _TESTS = [{
        'url': 'http://www.playvid.com/watch/RnmBNgtrrJu',
        'md5': 'ffa2f6b2119af359f544388d8c01eb6c',
        'info_dict': {
            'id': 'RnmBNgtrrJu',
            'ext': 'mp4',
            'title': 'md5:9256d01c6317e3f703848b5906880dc8',
            'duration': 82,
            'age_limit': 18,
        },
        'skip': 'Video removed due to ToS',
    }, {
        'url': 'http://www.playvid.com/watch/hwb0GpNkzgH',
        'md5': '39d49df503ad7b8f23a4432cbf046477',
        'info_dict': {
            'id': 'hwb0GpNkzgH',
            'ext': 'mp4',
            'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park',
            'age_limit': 18,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }]

    def _real_extract(self, url):
        """Extract metadata and format list from the flashvars embedded in the page."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Site renders a human-readable error block when the video is gone;
        # surface it verbatim instead of failing on missing flashvars.
        m_error = re.search(
            r'<div class="block-error">\s*<div class="heading">\s*<div>(?P<msg>.+?)</div>\s*</div>', webpage)
        if m_error:
            raise ExtractorError(clean_html(m_error.group('msg')), expected=True)

        video_title = None
        duration = None
        video_thumbnail = None
        formats = []

        # most of the information is stored in the flashvars
        flashvars = self._html_search_regex(
            r'flashvars="(.+?)"', webpage, 'flashvars')

        # flashvars is a percent-encoded &-separated key=value list; scan it
        # once, picking out scalar metadata and the per-resolution video URLs.
        infos = compat_urllib_parse_unquote(flashvars).split(r'&')
        for info in infos:
            videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
            if videovars_match:
                key = videovars_match.group(1)
                val = videovars_match.group(2)

                if key == 'title':
                    video_title = compat_urllib_parse_unquote_plus(val)
                if key == 'duration':
                    try:
                        duration = int(val)
                    except ValueError:
                        pass
                if key == 'big_thumb':
                    video_thumbnail = val

                # Keys like video_urls][720p carry the actual media URLs.
                videourl_match = re.match(
                    r'^video_urls\]\[(?P<resolution>[0-9]+)p', key)
                if videourl_match:
                    height = int(videourl_match.group('resolution'))
                    formats.append({
                        'height': height,
                        'url': val,
                    })
        self._sort_formats(formats)

        # Extract title - should be in the flashvars; if not, look elsewhere
        if video_title is None:
            video_title = self._html_search_regex(
                r'<title>(.*?)</title', webpage, 'title')

        return {
            'id': video_id,
            'formats': formats,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'duration': duration,
            'description': None,
            'age_limit': 18
        }


================================================
FILE: youtube_dl/extractor/playwire.py
================================================
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    dict_get,
    float_or_none,
)


class PlaywireIE(InfoExtractor):
    # Playwire config/embed URLs; both config.playwire.com JSON endpoints and
    # cdn.playwire.com embed pages resolve through the zeus.json player config.
    _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json',
        'md5': 'e6398701e3595888125729eaa2329ed9',
        'info_dict': {
            'id': '3353705',
            'ext': 'mp4',
            'title': 'S04_RM_UCL_Rus',
            'thumbnail': r're:^https?://.*\.png$',
            'duration': 145.94,
        },
    }, {
        # m3u8 in f4m
        'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json',
        'info_dict': {
            'id': '4840492',
            'ext': 'mp4',
            'title': 'ITV EL SHOW FULL',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # Multiple resolutions while bitrates missing
        'url': 'http://cdn.playwire.com/11625/embed/85228.html',
        'only_matching': True,
    }, {
        'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json',
        'only_matching': True,
    }, {
        'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the zeus.json player config and build formats from its f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id')

        player = self._download_json(
            'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id),
            video_id)

        title = player['settings']['title']
        duration = float_or_none(player.get('duration'), 1000)

        content = player['content']
        thumbnail = content.get('poster')
        src = content['media']['f4m']

        formats = self._extract_f4m_formats(src,
video_id, m3u8_id='hls')
        for a_format in formats:
            # When the manifest carries no bitrate/size hints, prefer the
            # variant whose URL is tagged -hd.
            if not dict_get(a_format, ['tbr', 'width', 'height']):
                a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }


================================================
FILE: youtube_dl/extractor/pluralsight.py
================================================
from __future__ import unicode_literals

import collections
import json
import os
import random
import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
    compat_urlparse,
)
from ..utils import (
    dict_get,
    ExtractorError,
    float_or_none,
    int_or_none,
    parse_duration,
    qualities,
    srt_subtitles_timecode,
    try_get,
    update_url_query,
    urlencode_postdata,
)


class PluralsightBaseIE(InfoExtractor):
    # Shared plumbing for the player and course extractors: GraphQL endpoint
    # constants and course-metadata download with an old-API fallback.
    _API_BASE = 'https://app.pluralsight.com'

    _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE
    _GRAPHQL_HEADERS = {
        'Content-Type': 'application/json;charset=UTF-8',
    }
    # Course bootstrap query; %s is interpolated with the course id.
    _GRAPHQL_COURSE_TMPL = '''
query BootstrapPlayer {
  rpc {
    bootstrapPlayer {
      profile {
        firstName
        lastName
        email
        username
        userHandle
        authed
        isAuthed
        plan
      }
      course(courseId: "%s") {
        name
        title
        courseHasCaptions
        translationLanguages {
          code
          name
        }
        supportsWideScreenVideoFormats
        timestamp
        modules {
          name
          title
          duration
          formattedDuration
          author
          authorized
          clips {
            authorized
            clipId
            duration
            formattedDuration
            id
            index
            moduleIndex
            moduleTitle
            name
            title
            watched
          }
        }
      }
    }
  }
}'''

    def _download_course(self, course_id, url, display_id):
        """Download course metadata, falling back to the legacy payload API
        when the GraphQL RPC raises."""
        try:
            return self._download_course_rpc(course_id, url, display_id)
        except ExtractorError:
            # Old API fallback
            return self._download_json(
                'https://app.pluralsight.com/player/user/api/v1/player/payload',
                display_id, data=urlencode_postdata({'courseId': course_id}),
                headers={'Referer': url})

    def _download_course_rpc(self, course_id, url, display_id):
        """Query the GraphQL bootstrapPlayer RPC; raise with the server's
        error message when no course object comes back."""
        response = self._download_json(
            self._GRAPHQL_EP, display_id, data=json.dumps({
                'query': self._GRAPHQL_COURSE_TMPL % course_id,
                'variables': {}
            }).encode('utf-8'),
            headers=self._GRAPHQL_HEADERS)

        course = try_get(
            response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'],
            dict)
        if course:
            return course

        raise ExtractorError(
            '%s said: %s' % (self.IE_NAME, response['error']['message']),
            expected=True)


class PluralsightIE(PluralsightBaseIE):
    IE_NAME = 'pluralsight'
    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?'
    _LOGIN_URL = 'https://app.pluralsight.com/id/'

    _NETRC_MACHINE = 'pluralsight'

    _TESTS = [{
        'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas',
        'md5': '4d458cf5cf4c593788672419a8dd4cf8',
        'info_dict': {
            'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
            'ext': 'mp4',
            'title': 'Demo Monitoring',
            'duration': 338,
        },
        'skip': 'Requires pluralsight account credentials',
    }, {
        'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live',
        'only_matching': True,
    }, {
        # available without pluralsight account
        'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started',
        'only_matching': True,
    }, {
        'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0',
        'only_matching': True,
    }]

    # Per-clip media query; filled in with a %-dict (author, clipIndex, ...).
    GRAPHQL_VIEWCLIP_TMPL = '''
query viewClip {
  viewClip(input: {
    author: "%(author)s",
    clipIndex: %(clipIndex)d,
    courseName: "%(courseName)s",
    includeCaptions: %(includeCaptions)s,
    locale: "%(locale)s",
    mediaType: "%(mediaType)s",
    moduleName: "%(moduleName)s",
    quality: "%(quality)s"
  }) {
    urls {
      url
      cdn
      rank
      source
    },
    status
  }
}'''

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Log in with .netrc/CLI credentials; no-op when none are given."""
        username, password = self._get_login_info()
        if
username is None: return login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') login_form = self._hidden_inputs(login_page) login_form.update({ 'Username': username, 'Password': password, }) post_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Content-Type': 'application/x-www-form-urlencoded'}) error = self._search_regex( r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', response, 'error message', default=None) if error: raise ExtractorError('Unable to login: %s' % error, expected=True) if all(not re.search(p, response) for p in ( r'__INITIAL_STATE__', r'["\']currentUser["\']', # new layout? r'>\s*Sign out\s*<')): BLOCKED = 'Your account has been blocked due to suspicious activity' if BLOCKED in response: raise ExtractorError( 'Unable to login: %s' % BLOCKED, expected=True) MUST_AGREE = 'To continue using Pluralsight, you must agree to' if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): raise ExtractorError( 'Unable to login: %s some documents. Go to pluralsight.com, ' 'log in and agree with what Pluralsight requires.' 
% MUST_AGREE, expected=True) raise ExtractorError('Unable to log in') def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id): captions = None if clip_id: captions = self._download_json( '%s/transcript/api/v1/caption/json/%s/%s' % (self._API_BASE, clip_id, lang), video_id, 'Downloading captions JSON', 'Unable to download captions JSON', fatal=False) if not captions: captions_post = { 'a': author, 'cn': int(clip_idx), 'lc': lang, 'm': name, } captions = self._download_json( '%s/player/retrieve-captions' % self._API_BASE, video_id, 'Downloading captions JSON', 'Unable to download captions JSON', fatal=False, data=json.dumps(captions_post).encode('utf-8'), headers={'Content-Type': 'application/json;charset=utf-8'}) if captions: return { lang: [{ 'ext': 'json', 'data': json.dumps(captions), }, { 'ext': 'srt', 'data': self._convert_subtitles(duration, captions), }] } @staticmethod def _convert_subtitles(duration, subs): srt = '' TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset') TEXT_KEYS = ('text', 'Text') for num, current in enumerate(subs): current = subs[num] start, text = ( float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), dict_get(current, TEXT_KEYS)) if start is None or text is None: continue end = duration if num == len(subs) - 1 else float_or_none( dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) if end is None: continue srt += os.linesep.join( ( '%d' % num, '%s --> %s' % ( srt_subtitles_timecode(start), srt_subtitles_timecode(end)), text, os.linesep, )) return srt def _real_extract(self, url): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) author = qs.get('author', [None])[0] name = qs.get('name', [None])[0] clip_idx = qs.get('clip', [None])[0] course_name = qs.get('course', [None])[0] if any(not f for f in (author, name, clip_idx, course_name,)): raise ExtractorError('Invalid URL', expected=True) display_id = '%s-%s' % (name, clip_idx) course = 
self._download_course(course_name, url, display_id) collection = course['modules'] clip = None for module_ in collection: if name in (module_.get('moduleName'), module_.get('name')): for clip_ in module_.get('clips', []): clip_index = clip_.get('clipIndex') if clip_index is None: clip_index = clip_.get('index') if clip_index is None: continue if compat_str(clip_index) == clip_idx: clip = clip_ break if not clip: raise ExtractorError('Unable to resolve clip') title = clip['title'] clip_id = clip.get('clipName') or clip.get('name') or clip['clipId'] QUALITIES = { 'low': {'width': 640, 'height': 480}, 'medium': {'width': 848, 'height': 640}, 'high': {'width': 1024, 'height': 768}, 'high-widescreen': {'width': 1280, 'height': 720}, } QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) quality_key = qualities(QUALITIES_PREFERENCE) AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) ALLOWED_QUALITIES = ( AllowedQuality('webm', ['high', ]), AllowedQuality('mp4', ['low', 'medium', 'high', ]), ) # Some courses also offer widescreen resolution for high quality (see # https://github.com/ytdl-org/youtube-dl/issues/7766) widescreen = course.get('supportsWideScreenVideoFormats') is True best_quality = 'high-widescreen' if widescreen else 'high' if widescreen: for allowed_quality in ALLOWED_QUALITIES: allowed_quality.qualities.append(best_quality) # In order to minimize the number of calls to ViewClip API and reduce # the probability of being throttled or banned by Pluralsight we will request # only single format until formats listing was explicitly requested. 
if self._downloader.params.get('listformats', False): allowed_qualities = ALLOWED_QUALITIES else: def guess_allowed_qualities(): req_format = self._downloader.params.get('format') or 'best' req_format_split = req_format.split('-', 1) if len(req_format_split) > 1: req_ext, req_quality = req_format_split req_quality = '-'.join(req_quality.split('-')[:2]) for allowed_quality in ALLOWED_QUALITIES: if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: return (AllowedQuality(req_ext, (req_quality, )), ) req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' return (AllowedQuality(req_ext, (best_quality, )), ) allowed_qualities = guess_allowed_qualities() formats = [] for ext, qualities_ in allowed_qualities: for quality in qualities_: f = QUALITIES[quality].copy() clip_post = { 'author': author, 'includeCaptions': 'false', 'clipIndex': int(clip_idx), 'courseName': course_name, 'locale': 'en', 'moduleName': name, 'mediaType': ext, 'quality': '%dx%d' % (f['width'], f['height']), } format_id = '%s-%s' % (ext, quality) try: viewclip = self._download_json( self._GRAPHQL_EP, display_id, 'Downloading %s viewclip graphql' % format_id, data=json.dumps({ 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, 'variables': {} }).encode('utf-8'), headers=self._GRAPHQL_HEADERS)['data']['viewClip'] except ExtractorError: # Still works but most likely will go soon viewclip = self._download_json( '%s/video/clips/viewclip' % self._API_BASE, display_id, 'Downloading %s viewclip JSON' % format_id, fatal=False, data=json.dumps(clip_post).encode('utf-8'), headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover it may even lead # to account ban (see https://github.com/ytdl-org/youtube-dl/issues/6842). 
# To somewhat reduce the probability of these consequences # we will sleep random amount of time before each call to ViewClip. self._sleep( random.randint(5, 10), display_id, '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') if not viewclip: continue clip_urls = viewclip.get('urls') if not isinstance(clip_urls, list): continue for clip_url_data in clip_urls: clip_url = clip_url_data.get('url') if not clip_url: continue cdn = clip_url_data.get('cdn') clip_f = f.copy() clip_f.update({ 'url': clip_url, 'ext': ext, 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, 'quality': quality_key(quality), 'source_preference': int_or_none(clip_url_data.get('rank')), }) formats.append(clip_f) self._sort_formats(formats) duration = int_or_none( clip.get('duration')) or parse_duration(clip.get('formattedDuration')) # TODO: other languages? subtitles = self.extract_subtitles( author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id) return { 'id': clip_id, 'title': title, 'duration': duration, 'creator': author, 'formats': formats, 'subtitles': subtitles, } class PluralsightCourseIE(PluralsightBaseIE): IE_NAME = 'pluralsight:course' _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)' _TESTS = [{ # Free course from Pluralsight Starter Subscription for Microsoft TechNet # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', 'info_dict': { 'id': 'hosting-sql-server-windows-azure-iaas', 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', }, 'playlist_count': 31, }, { # available without pluralsight account 'url': 'https://www.pluralsight.com/courses/angularjs-get-started', 'only_matching': True, }, { 'url': 
'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents', 'only_matching': True, }] def _real_extract(self, url): course_id = self._match_id(url) # TODO: PSM cookie course = self._download_course(course_id, url, course_id) title = course['title'] course_name = course['name'] course_data = course['modules'] description = course.get('description') or course.get('shortDescription') entries = [] for num, module in enumerate(course_data, 1): author = module.get('author') module_name = module.get('name') if not author or not module_name: continue for clip in module.get('clips', []): clip_index = int_or_none(clip.get('index')) if clip_index is None: continue clip_url = update_url_query( '%s/player' % self._API_BASE, query={ 'mode': 'live', 'course': course_name, 'author': author, 'name': module_name, 'clip': clip_index, }) entries.append({ '_type': 'url_transparent', 'url': clip_url, 'ie_key': PluralsightIE.ie_key(), 'chapter': module.get('title'), 'chapter_number': num, 'chapter_id': module.get('moduleRef'), }) return self.playlist_result(entries, course_id, title, description) ================================================ FILE: youtube_dl/extractor/podomatic.py ================================================ from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..utils import int_or_none class PodomaticIE(InfoExtractor): IE_NAME = 'podomatic' _VALID_URL = r'''(?x) (?P<proto>https?):// (?: (?P<channel>[^.]+)\.podomatic\.com/entry| (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes )/ (?P<id>[^/?#&]+) ''' _TESTS = [{ 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', 'md5': '84bb855fcf3429e6bf72460e1eed782d', 'info_dict': { 'id': '2009-01-02T16_03_35-08_00', 'ext': 'mp3', 'uploader': 'Science Teaching Tips', 'uploader_id': 'scienceteachingtips', 'title': '64. 
When the Moon Hits Your Eye',
            'duration': 446,
        }
    }, {
        'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
        'md5': 'd2cf443931b6148e27638650e2638297',
        'info_dict': {
            'id': '2013-11-15T16_31_21-08_00',
            'ext': 'mp3',
            'uploader': 'Ostbahnhof / Techno Mix',
            'uploader_id': 'ostbahnhof',
            'title': 'Einunddreizig',
            'duration': 3799,
        }
    }, {
        'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Read the embed_params JSON and return a direct (or RTMP-derived) media URL."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Channel may come from either the subdomain or the path variant.
        channel = mobj.group('channel') or mobj.group('channel_2')

        json_url = (('%s://%s.podomatic.com/entry/embed_params/%s'
                     + '?permalink=true&rtmp=0') % (mobj.group('proto'), channel, video_id))
        data_json = self._download_webpage(
            json_url, video_id, 'Downloading video info')
        data = json.loads(data_json)

        video_url = data['downloadLink']
        if not video_url:
            # No direct download: rebuild an HTTP URL from the RTMP streamer.
            video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation'])
        uploader = data['podcast']
        title = data['title']
        thumbnail = data['imageLocation']
        duration = int_or_none(data.get('length'), 1000)

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'uploader': uploader,
            'uploader_id': channel,
            'thumbnail': thumbnail,
            'duration': duration,
        }


================================================
FILE: youtube_dl/extractor/pokemon.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    extract_attributes,
    int_or_none,
)


class PokemonIE(InfoExtractor):
    # Matches either a ?play=<32-hex-id> query or a display-id path segment.
    _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))'
    _TESTS = [{
        'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/',
        'md5': '2fe8eaec69768b25ef898cda9c43062e',
        'info_dict': {
            'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4',
            'ext': 'mp4',
            'title': 'The Ol’ Raise and Switch!',
            'description': 'md5:7db77f7107f98ba88401d3adc80ff7af',
        },
        'add_id': ['LimelightMedia'],
    }, {
        # no data-video-title
        'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008',
        'info_dict': {
            'id': 'dfbaf830d7e54e179837c50c0c6cc0e1',
            'ext': 'mp4',
            'title': "Pokémon : L'ascension de Darkrai",
            'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5',
        },
        'add_id': ['LimelightMedia'],
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2',
        'only_matching': True,
    }, {
        'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/',
        'only_matching': True,
    }, {
        'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Pull the data-video-* attributes from the page and hand off to the
        Limelight extractor via a url_transparent result."""
        video_id, display_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, video_id or display_id)
        # When the URL carried no video id, match any 32-hex data-video-id.
        video_data = extract_attributes(self._search_regex(
            r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'),
            webpage, 'video data element'))
        video_id = video_data['data-video-id']
        title = video_data.get('data-video-title') or self._html_search_meta(
            'pkm-title', webpage, ' title', default=None) or self._search_regex(
            r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title')
        return {
            '_type': 'url_transparent',
            'id': video_id,
            'url': 'limelight:media:%s' % video_id,
            'title': title,
            'description': video_data.get('data-video-summary'),
            'thumbnail': video_data.get('data-video-poster'),
            'series': 'Pokémon',
            'season_number': int_or_none(video_data.get('data-video-season')),
            'episode': title,
            'episode_number': int_or_none(video_data.get('data-video-episode')),
            'ie_key': 'LimelightMedia',
        }


================================================
FILE: youtube_dl/extractor/polskieradio.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import itertools
import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
    compat_urllib_parse_unquote,
    compat_urlparse
)
from ..utils import (
    extract_attributes,
    int_or_none,
    strip_or_none,
    unified_timestamp,
)


class PolskieRadioIE(InfoExtractor):
    # Article pages; each article can embed several audio/video media items.
    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
        'info_dict': {
            'id': '1587943',
            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
        },
        'playlist': [{
            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
            'info_dict': {
                'id': '1540576',
                'ext': 'mp3',
                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
                'timestamp': 1456594200,
                'upload_date': '20160227',
                'duration': 2364,
                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
            },
        }],
    }, {
        'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
        'info_dict': {
            'id': '1635803',
            'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
            'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
        'only_matching': True,
    }, {
        # with mp4 video
        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Collect every data-media JSON blob in the article body into a playlist."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        content = self._search_regex(
            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
            webpage, 'content')

        timestamp = unified_timestamp(self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
            webpage, 'timestamp', fatal=False))

        thumbnail_url = self._og_search_thumbnail(webpage)

        entries = []

        # The page may repeat the same media; dedupe by URL.
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
            media = self._parse_json(data_media, playlist_id, fatal=False)
            if not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'], 'http:')
            if media_url in media_urls:
                continue
            media_urls.add(media_url)
            entries.append({
                'id': compat_str(media['id']),
                'url': media_url,
                'title': compat_urllib_parse_unquote(media['desc']),
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
                'timestamp': timestamp,
                'thumbnail': thumbnail_url
            })

        title = self._og_search_title(webpage).strip()
        description = strip_or_none(self._og_search_description(webpage))

        return self.playlist_result(entries, playlist_id, title, description)


class PolskieRadioCategoryIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
        'info_dict': {
            'id': '5102',
            'title': 'HISTORIA ŻYWA',
        },
        'playlist_mincount': 38,
    }, {
        'url': 'http://www.polskieradio.pl/7/4807',
        'info_dict': {
            'id': '4807',
            'title': 'Vademecum 1050. rocznicy Chrztu Polski'
        },
        'playlist_mincount': 5
    }, {
        'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
        'only_matching': True
    }, {
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61
    }, {
        'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
        'info_dict': {
            'id': '214',
            'title': 'Muzyka',
        },
        'playlist_mincount': 61
    }, {
        'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Article URLs are handled by PolskieRadioIE; defer to it.
        return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url)

    def _entries(self, url, page, category_id):
        """Yield article url_results from the category listing, following the
        'next' pagination link until it disappears."""
        content = page
        for page_num in itertools.count(2):
            for a_entry, entry_id in re.findall(
                    r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
                    content):
                entry = extract_attributes(a_entry)
                href = entry.get('href')
                if not href:
                    continue
                yield self.url_result(
                    compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(),
                    entry_id, entry.get('title'))
            mobj = re.search(
                r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
                content)
            if not mobj:
                break
            next_url = compat_urlparse.urljoin(url, mobj.group('url'))
            content = self._download_webpage(
                next_url, category_id, 'Downloading page %s' % page_num)

    def _real_extract(self, url):
        category_id = self._match_id(url)
        webpage = self._download_webpage(url, category_id)
        title = self._html_search_regex(
            r'<title>([^<]+) - [^<]+ - [^<]+', webpage, 'title', fatal=False)
        return self.playlist_result(
            self._entries(url, webpage, category_id),
            category_id, title)


================================================
FILE:
youtube_dl/extractor/popcorntimes.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_chr, ) from ..utils import int_or_none class PopcorntimesIE(InfoExtractor): _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P[^/]+)/(?P[^/?#&]+)' _TEST = { 'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy', 'md5': '93f210991ad94ba8c3485950a2453257', 'info_dict': { 'id': 'A1XCFvz', 'display_id': 'haensel-und-gretel-opera-fantasy', 'ext': 'mp4', 'title': 'Hänsel und Gretel', 'description': 'md5:1b8146791726342e7b22ce8125cf6945', 'thumbnail': r're:^https?://.*\.jpg$', 'creator': 'John Paul', 'release_date': '19541009', 'duration': 4260, 'tbr': 5380, 'width': 720, 'height': 540, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id, display_id = mobj.group('id', 'display_id') webpage = self._download_webpage(url, display_id) title = self._search_regex( r'

([^<]+)', webpage, 'title', default=None) or self._html_search_meta( 'ya:ovs:original_name', webpage, 'title', fatal=True) loc = self._search_regex( r'PCTMLOC\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'loc', group='value') loc_b64 = '' for c in loc: c_ord = ord(c) if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'): upper = ord('Z') if c_ord <= ord('Z') else ord('z') c_ord += 13 if upper < c_ord: c_ord -= 26 loc_b64 += compat_chr(c_ord) video_url = compat_b64decode(loc_b64).decode('utf-8') description = self._html_search_regex( r'(?s)]+class=["\']pt-movie-desc[^>]+>(.+?)', webpage, 'description', fatal=False) thumbnail = self._search_regex( r']+class=["\']video-preview[^>]+\bsrc=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', default=None, group='value') or self._og_search_thumbnail(webpage) creator = self._html_search_meta( 'video:director', webpage, 'creator', default=None) release_date = self._html_search_meta( 'video:release_date', webpage, default=None) if release_date: release_date = release_date.replace('-', '') def int_meta(name): return int_or_none(self._html_search_meta( name, webpage, default=None)) return { 'id': video_id, 'display_id': display_id, 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'creator': creator, 'release_date': release_date, 'duration': int_meta('video:duration'), 'tbr': int_meta('ya:ovs:bitrate'), 'width': int_meta('og:video:width'), 'height': int_meta('og:video:height'), 'http_headers': { 'Referer': url, }, } ================================================ FILE: youtube_dl/extractor/popcorntv.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( extract_attributes, int_or_none, unified_timestamp, ) class PopcornTVIE(InfoExtractor): _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P[^/]+)/(?P\d+)' _TESTS = [{ 'url': 
'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', 'md5': '47d65a48d147caf692ab8562fe630b45', 'info_dict': { 'id': '9183', 'display_id': 'food-wars-battaglie-culinarie-episodio-01', 'ext': 'mp4', 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1497610857, 'upload_date': '20170616', 'duration': 1440, 'view_count': int, }, }, { 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id, video_id = mobj.group('display_id', 'id') webpage = self._download_webpage(url, display_id) m3u8_url = extract_attributes( self._search_regex( r'(]+itemprop=["\'](?:content|embed)Url[^>]*>)', webpage, 'content' ))['href'] formats = self._extract_m3u8_formats( m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') title = self._search_regex( r']+itemprop=["\']name[^>]*>([^<]+)', webpage, 'title', default=None) or self._og_search_title(webpage) description = self._html_search_regex( r'(?s)]+itemprop=["\']description[^>]*>(.+?)', webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) timestamp = unified_timestamp(self._html_search_meta( 'uploadDate', webpage, 'timestamp')) duration = int_or_none(self._html_search_meta( 'duration', webpage), invscale=60) view_count = int_or_none(self._html_search_meta( 'interactionCount', webpage, 'view count')) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, 'view_count': view_count, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/porn91.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from 
..utils import ( parse_duration, int_or_none, ExtractorError, ) class Porn91IE(InfoExtractor): IE_NAME = '91porn' _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P[\w\d]+)' _TEST = { 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', 'md5': '7fcdb5349354f40d41689bd0fa8db05a', 'info_dict': { 'id': '7e42283b4f5ab36da134', 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', 'ext': 'mp4', 'duration': 431, 'age_limit': 18, } } def _real_extract(self, url): video_id = self._match_id(url) self._set_cookie('91porn.com', 'language', 'cn_CN') webpage = self._download_webpage( 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) if '作为游客,你每天只可观看10个视频' in webpage: raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) title = self._search_regex( r'
([^<]+)
', webpage, 'title') title = title.replace('\n', '') video_link_url = self._search_regex( r']+id=["\']fm-video_link[^>]+>([^<]+)', webpage, 'video link') videopage = self._download_webpage(video_link_url, video_id) info_dict = self._parse_html5_media_entries(url, videopage, video_id)[0] duration = parse_duration(self._search_regex( r'时长:\s*\s*(\d+:\d+)', webpage, 'duration', fatal=False)) comment_count = int_or_none(self._search_regex( r'留言:\s*\s*(\d+)', webpage, 'comment count', fatal=False)) info_dict.update({ 'id': video_id, 'title': title, 'duration': duration, 'comment_count': comment_count, 'age_limit': self._rta_search(webpage), }) return info_dict ================================================ FILE: youtube_dl/extractor/porncom.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, parse_filesize, str_to_int, ) class PornComIE(InfoExtractor): _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P[^/]+)-)?(?P\d+)' _TESTS = [{ 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', 'md5': '3f30ce76267533cd12ba999263156de7', 'info_dict': { 'id': '2603339', 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', 'ext': 'mp4', 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 551, 'view_count': int, 'age_limit': 18, 'categories': list, 'tags': list, }, }, { 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id webpage = self._download_webpage(url, display_id) config = self._parse_json( self._search_regex( (r'=\s*({.+?})\s*;\s*v1ar\b', 
r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), webpage, 'config', default='{}'), display_id, transform_source=js_to_json, fatal=False) if config: title = config['title'] formats = [{ 'url': stream['url'], 'format_id': stream.get('id'), 'height': int_or_none(self._search_regex( r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) } for stream in config['streams'] if stream.get('url')] thumbnail = (compat_urlparse.urljoin( config['thumbCDN'], config['poster']) if config.get('thumbCDN') and config.get('poster') else None) duration = int_or_none(config.get('length')) else: title = self._search_regex( (r'([^<]+)', r']*>([^<]+)

'), webpage, 'title') formats = [{ 'url': compat_urlparse.urljoin(url, format_url), 'format_id': '%sp' % height, 'height': int(height), 'filesize_approx': parse_filesize(filesize), } for format_url, height, filesize in re.findall( r']+href="(/download/[^"]+)">[^<]*?(\d+)p]*>(\d+\s*[a-zA-Z]+)<', webpage)] thumbnail = None duration = None self._sort_formats(formats) view_count = str_to_int(self._search_regex( (r'Views:\s*\s*\s*([\d,.]+)', r'class=["\']views["\'][^>]*>

([\d,.]+)'), webpage, 'view count', fatal=False)) def extract_list(kind): s = self._search_regex( (r'(?s)%s:\s*\s*(.+?)' % kind.capitalize(), r'(?s)]*>%s:(.+?)

' % kind.capitalize()), webpage, kind, fatal=False) return re.findall(r']+>([^<]+)', s or '') return { 'id': video_id, 'display_id': display_id, 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, 'formats': formats, 'age_limit': 18, 'categories': extract_list('categories'), 'tags': extract_list('tags'), } ================================================ FILE: youtube_dl/extractor/pornhd.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( determine_ext, ExtractorError, int_or_none, js_to_json, merge_dicts, urljoin, ) class PornHdIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P\d+)(?:/(?P.+))?' _TESTS = [{ 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', 'md5': '87f1540746c1d32ec7a2305c12b96b25', 'info_dict': { 'id': '9864', 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video', 'ext': 'mp4', 'title': 'Restroom selfie masturbation', 'description': 'md5:3748420395e03e31ac96857a8f125b2b', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, 'like_count': int, 'age_limit': 18, }, 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de', 'info_dict': { 'id': '1962', 'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'ext': 'mp4', 'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759', 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294', 'thumbnail': r're:^https?://.*\.jpg', 'view_count': int, 'like_count': int, 'age_limit': 18, }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id or video_id) title = self._html_search_regex( 
[r']+class=["\']video-name["\'][^>]*>([^<]+)', r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') sources = self._parse_json(js_to_json(self._search_regex( r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) info = {} if not sources: entries = self._parse_html5_media_entries(url, webpage, video_id) if entries: info = entries[0] if not sources and not info: message = self._html_search_regex( r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P.+?)]+class=["\']video-description[^>]+>(?P.+?)', r'<(div|p)[^>]+class="description"[^>]*>(?P[^<]+)(?:(?!\1).)+)\1", webpage, 'thumbnail', default=None, group='url') like_count = int_or_none(self._search_regex( (r'(\d+)
\s*likes', r'(\d+)\s*]+>(?: |\s)*\blikes', r'class=["\']save-count["\'][^>]*>\s*(\d+)'), webpage, 'like count', fatal=False)) return merge_dicts(info, { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'view_count': view_count, 'like_count': like_count, 'formats': formats, 'age_limit': 18, }) ================================================ FILE: youtube_dl/extractor/pornhub.py ================================================ # coding: utf-8 from __future__ import unicode_literals import functools import itertools import operator import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, compat_urllib_request, ) from .openload import PhantomJSwrapper from ..utils import ( determine_ext, ExtractorError, int_or_none, merge_dicts, NO_DEFAULT, orderedSet, remove_quotes, str_to_int, update_url_query, urlencode_postdata, url_or_none, ) class PornHubBaseIE(InfoExtractor): _NETRC_MACHINE = 'pornhub' _PORNHUB_HOST_RE = r'(?:(?Ppornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)' def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) ret = dl(*args, **kwargs) if not ret: return ret webpage, urlh = ret if any(re.search(p, webpage) for p in ( r']+\bonload=["\']go\(\)', r'document\.cookie\s*=\s*["\']RNKEY=', r'document\.location\.reload\(true\)')): url_or_request = args[0] url = (url_or_request.get_full_url() if isinstance(url_or_request, compat_urllib_request.Request) else url_or_request) phantom = PhantomJSwrapper(self, required_version='2.0') phantom.get(url, html=webpage) webpage, urlh = dl(*args, **kwargs) return webpage, urlh def _real_initialize(self): self._logged_in = False def _login(self, host): if self._logged_in: return site = host.split('.')[0] # Both sites pornhub and pornhubpremium have separate accounts # so there should be an option to provide credentials 
for both. # At the same time some videos are available under the same video id # on both sites so that we have to identify them as the same video. # For that purpose we have to keep both in the same extractor # but under different netrc machines. username, password = self._get_login_info(netrc_machine=site) if username is None: return login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') login_page = self._download_webpage( login_url, None, 'Downloading %s login page' % site) def is_logged(webpage): return any(re.search(p, webpage) for p in ( r'class=["\']signOut', r'>Sign\s+[Oo]ut\s*<')) if is_logged(login_page): self._logged_in = True return login_form = self._hidden_inputs(login_page) login_form.update({ 'username': username, 'password': password, }) response = self._download_json( 'https://www.%s/front/authenticate' % host, None, 'Logging in to %s' % site, data=urlencode_postdata(login_form), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Referer': login_url, 'X-Requested-With': 'XMLHttpRequest', }) if response.get('success') == '1': self._logged_in = True return message = response.get('message') if message is not None: raise ExtractorError( 'Unable to login: %s' % message, expected=True) raise ExtractorError('Unable to log in') class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' _VALID_URL = r'''(?x) https?:// (?: (?:[^/]+\.)? 
%s /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) ''' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': 'a6391306d050e4547f62b3f485dd9ba9', 'info_dict': { 'id': '648719015', 'ext': 'mp4', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'uploader': 'Babes', 'upload_date': '20130628', 'timestamp': 1372447216, 'duration': 361, 'view_count': int, 'like_count': int, 'dislike_count': int, 'comment_count': int, 'age_limit': 18, 'tags': list, 'categories': list, }, }, { # non-ASCII title 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002', 'info_dict': { 'id': '1331683002', 'ext': 'mp4', 'title': '重庆婷婷女王足交', 'upload_date': '20150213', 'timestamp': 1423804862, 'duration': 1753, 'view_count': int, 'like_count': int, 'dislike_count': int, 'comment_count': int, 'age_limit': 18, 'tags': list, 'categories': list, }, 'params': { 'skip_download': True, }, 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', }, { # subtitles 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', 'info_dict': { 'id': 'ph5af5fef7c2aa7', 'ext': 'mp4', 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor', 'uploader': 'BFFs', 'duration': 622, 'view_count': int, 'like_count': int, 'dislike_count': int, 'comment_count': int, 'age_limit': 18, 'tags': list, 'categories': list, 'subtitles': { 'en': [{ "ext": 'srt' }] }, }, 'params': { 'skip_download': True, }, 'skip': 'This video has been disabled', }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, }, { # removed at the request of cam4.com 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', 'only_matching': True, }, { # removed at the request of the copyright owner 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', 'only_matching': True, }, { # 
removed by uploader 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', 'only_matching': True, }, { # private video 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', 'only_matching': True, }, { 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', 'only_matching': True, }, { 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', 'only_matching': True, }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, }, { 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933', 'only_matching': True, }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, }, { # Some videos are available with the same id on both premium # and non-premium sites (e.g. this and the following test) 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', 'only_matching': True, }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', 'only_matching': True, }, { # geo restricted 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156', 'only_matching': True, }, { 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156', 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex( pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') self._login(host) self._set_cookie(host, 'age_verified', '1') def dl_webpage(platform): self._set_cookie(host, 'platform', platform) return self._download_webpage( 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id), video_id, 'Downloading %s webpage' % 
platform) webpage = dl_webpage('pc') error_msg = self._html_search_regex( (r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', r'(?s)]+class=["\']noVideo["\'][^>]*>(?P.+?)'), webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) raise ExtractorError( 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) if any(re.search(p, webpage) for p in ( r'class=["\']geoBlocked["\']', r'>\s*This content is unavailable in your country')): self.raise_geo_restricted() # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. title = self._html_search_meta( 'twitter:title', webpage, default=None) or self._html_search_regex( (r'(?s)]+class=["\']title["\'][^>]*>(?P.+?)</h1>', r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1', r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'), webpage, 'title', group='title') video_urls = [] video_urls_set = set() subtitles = {} flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) if subtitle_url: subtitles.setdefault('en', []).append({ 'url': subtitle_url, 'ext': 'srt', }) thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) media_definitions = flashvars.get('mediaDefinitions') if isinstance(media_definitions, list): for definition in media_definitions: if not isinstance(definition, dict): continue video_url = definition.get('videoUrl') if not video_url or not isinstance(video_url, compat_str): continue if video_url in video_urls_set: continue video_urls_set.add(video_url) video_urls.append( (video_url, int_or_none(definition.get('quality')))) else: thumbnail, duration = [None] * 2 def 
extract_js_vars(webpage, pattern, default=NO_DEFAULT): assignments = self._search_regex( pattern, webpage, 'encoded url', default=default) if not assignments: return {} assignments = assignments.split(';') js_vars = {} def parse_js_value(inp): inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) if '+' in inp: inps = inp.split('+') return functools.reduce( operator.concat, map(parse_js_value, inps)) inp = inp.strip() if inp in js_vars: return js_vars[inp] return remove_quotes(inp) for assn in assignments: assn = assn.strip() if not assn: continue assn = re.sub(r'var\s+', '', assn) vname, value = assn.split('=', 1) js_vars[vname] = parse_js_value(value) return js_vars def add_video_url(video_url): v_url = url_or_none(video_url) if not v_url: return if v_url in video_urls_set: return video_urls.append((v_url, None)) video_urls_set.add(v_url) def parse_quality_items(quality_items): q_items = self._parse_json(quality_items, video_id, fatal=False) if not isinstance(q_items, list): return for item in q_items: if isinstance(item, dict): add_video_url(item.get('url')) if not video_urls: FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') js_vars = extract_js_vars( webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), default=None) if js_vars: for key, format_url in js_vars.items(): if key.startswith(FORMAT_PREFIXES[-1]): parse_quality_items(format_url) elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]): add_video_url(format_url) if not video_urls and re.search( r'<[^>]+\bid=["\']lockedPlayer', webpage): raise ExtractorError( 'Video %s is locked' % video_id, expected=True) if not video_urls: js_vars = extract_js_vars( dl_webpage('tv'), r'(var.+?mediastring.+?)</script>') add_video_url(js_vars['mediastring']) for mobj in re.finditer( r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): video_url = mobj.group('url') if video_url not in video_urls_set: video_urls.append((video_url, None)) video_urls_set.add(video_url) upload_date = 
None formats = [] def add_format(format_url, height=None): ext = determine_ext(format_url) if ext == 'mpd': formats.extend(self._extract_mpd_formats( format_url, video_id, mpd_id='dash', fatal=False)) return if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) return if not height: height = int_or_none(self._search_regex( r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height', default=None)) formats.append({ 'url': format_url, 'format_id': '%dp' % height if height else None, 'height': height, }) for video_url, height in video_urls: if not upload_date: upload_date = self._search_regex( r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') if '/video/get_media' in video_url: medias = self._download_json(video_url, video_id, fatal=False) if isinstance(medias, list): for media in medias: if not isinstance(media, dict): continue video_url = url_or_none(media.get('videoUrl')) if not video_url: continue height = int_or_none(media.get('quality')) add_format(video_url, height) continue add_format(video_url) self._sort_formats( formats, field_preference=('height', 'width', 'fps', 'format_id')) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', webpage, 'uploader', default=None) def extract_vote_count(kind, name): return self._extract_count( (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)</span>' % kind, r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind), webpage, name) view_count = self._extract_count( r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view') like_count = extract_vote_count('Up', 'like') dislike_count = extract_vote_count('Down', 'dislike') comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') def 
extract_list(meta_key): div = self._search_regex( r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>' % meta_key, webpage, meta_key, default=None) if div: return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div) info = self._search_json_ld(webpage, video_id, default={}) # description provided in JSON-LD is irrelevant info['description'] = None return merge_dicts({ 'id': video_id, 'uploader': video_uploader, 'upload_date': upload_date, 'title': title, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, 'formats': formats, 'age_limit': 18, 'tags': extract_list('tags'), 'categories': extract_list('categories'), 'subtitles': subtitles, }, info) class PornHubPlaylistBaseIE(PornHubBaseIE): def _extract_page(self, url): return int_or_none(self._search_regex( r'\bpage=(\d+)', url, 'page', default=None)) def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see # https://github.com/ytdl-org/youtube-dl/issues/11594). 
container = self._search_regex( r'(?s)(<div[^>]+class=["\']container.+)', webpage, 'container', default=webpage) return [ self.url_result( 'http://www.%s/%s' % (host, video_url), PornHubIE.ie_key(), video_title=title) for video_url, title in orderedSet(re.findall( r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', container)) ] class PornHubUserIE(PornHubPlaylistBaseIE): _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, }, { 'url': 'https://www.pornhub.com/pornstar/liz-vicious', 'info_dict': { 'id': 'liz-vicious', }, 'playlist_mincount': 118, }, { 'url': 'https://www.pornhub.com/users/russianveet69', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/channels/povd', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, }, { # Unavailable via /videos page, but available with direct pagination # on pornstar page (see [1]), requires premium # 1. 
https://github.com/ytdl-org/youtube-dl/issues/27853 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', 'only_matching': True, }, { # Same as before, multi page 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', 'only_matching': True, }, { 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') videos_url = '%s/videos' % mobj.group('url') page = self._extract_page(url) if page: videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @staticmethod def _has_more(webpage): return re.search( r'''(?x) <li[^>]+\bclass=["\']page_next| <link[^>]+\brel=["\']next| <button[^>]+\bid=["\']moreDataBtn ''', webpage) is not None def _entries(self, url, host, item_id): page = self._extract_page(url) VIDEOS = '/videos' def download_page(base_url, num, fallback=False): note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') return self._download_webpage( base_url, item_id, note, query={'page': num}) def is_404(e): return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 base_url = url has_page = page is not None first_page = page if has_page else 1 for page_num in (first_page, ) if has_page else itertools.count(first_page): try: try: webpage = download_page(base_url, page_num) except ExtractorError as e: # Some sources may not be available via /videos page, # trying to fallback to main page pagination (see [1]) # 1. 
https://github.com/ytdl-org/youtube-dl/issues/27853 if is_404(e) and page_num == first_page and VIDEOS in base_url: base_url = base_url.replace(VIDEOS, '') webpage = download_page(base_url, page_num, fallback=True) else: raise except ExtractorError as e: if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) if not page_entries: break for e in page_entries: yield e if not self._has_more(webpage): break def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') item_id = mobj.group('id') self._login(host) return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, }, { 'url': 'http://www.pornhub.com/users/rushandlia/videos', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos', 'info_dict': { 'id': 'pornstar/jenny-blighe/videos', }, 'playlist_mincount': 149, }, { 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3', 'info_dict': { 'id': 'pornstar/jenny-blighe/videos', }, 'playlist_mincount': 40, }, { # default sorting as Top Rated Videos 'url': 'https://www.pornhub.com/channels/povd/videos', 'info_dict': { 'id': 'channels/povd/videos', }, 'playlist_mincount': 293, }, { # Top Rated Videos 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', 'only_matching': True, }, { # Most Recent Videos 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', 'only_matching': True, }, { # Most Viewed Videos 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', 'only_matching': True, }, { 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'only_matching': True, }, { # Most Viewed Videos 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv', 
'only_matching': True, }, { # Top Rated Videos 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr', 'only_matching': True, }, { # Longest Videos 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg', 'only_matching': True, }, { # Newest Videos 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/video', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/video?page=3', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/video/search?search=123', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/categories/teen', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/categories/teen?page=3', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/hd', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/hd?page=3', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/described-video', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/described-video?page=2', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn', 'only_matching': True, }, { 'url': 'https://www.pornhub.com/playlist/44121572', 'info_dict': { 'id': 'playlist/44121572', }, 'playlist_mincount': 132, }, { 'url': 'https://www.pornhub.com/playlist/4667351', 'only_matching': True, }, { 'url': 'https://de.pornhub.com/playlist/4667351', 'only_matching': True, }, { 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos', 'only_matching': True, }] @classmethod def suitable(cls, url): return (False if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url) else super(PornHubPagedVideoListIE, cls).suitable(url)) class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): _VALID_URL 
= r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { 'id': 'jenny-blighe', }, 'playlist_mincount': 129, }, { 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload', 'only_matching': True, }, { 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload', 'only_matching': True, }] ================================================ FILE: youtube_dl/extractor/pornotube.py ================================================ from __future__ import unicode_literals import json from .common import InfoExtractor from ..utils import int_or_none class PornotubeIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com/(?:[^?#]*?)/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.pornotube.com/orientation/straight/video/4964/title/weird-hot-and-wet-science', 'md5': '60fc5a4f0d93a97968fc7999d98260c9', 'info_dict': { 'id': '4964', 'ext': 'mp4', 'upload_date': '20141203', 'title': 'Weird Hot and Wet Science', 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0', 'categories': ['Adult Humor', 'Blondes'], 'uploader': 'Alpha Blue Archives', 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1417582800, 'age_limit': 18, } } def _real_extract(self, url): video_id = self._match_id(url) token = self._download_json( 'https://api.aebn.net/auth/v2/origins/authenticate', video_id, note='Downloading token', data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), headers={ 'Content-Type': 'application/json', 'Origin': 'http://www.pornotube.com', })['tokenKey'] video_url = self._download_json( 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, video_id, note='Downloading delivery information', headers={'Authorization': token})['mediaUrl'] FIELDS = ( 'title', 'description', 'startSecond', 'endSecond', 'publishDate', 'studios{name}', 'categories{name}', 'movieId', 
'primaryImageNumber' ) info = self._download_json( 'https://api.aebn.net/content/v2/clips/%s?fields=%s' % (video_id, ','.join(FIELDS)), video_id, note='Downloading metadata', headers={'Authorization': token}) if isinstance(info, list): info = info[0] title = info['title'] timestamp = int_or_none(info.get('publishDate'), scale=1000) uploader = info.get('studios', [{}])[0].get('name') movie_id = info.get('movieId') primary_image_number = info.get('primaryImageNumber') thumbnail = None if movie_id and primary_image_number: thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( movie_id, movie_id, primary_image_number) start = int_or_none(info.get('startSecond')) end = int_or_none(info.get('endSecond')) duration = end - start if start and end else None categories = [c['name'] for c in info.get('categories', []) if c.get('name')] return { 'id': video_id, 'url': video_url, 'title': title, 'description': info.get('description'), 'duration': duration, 'timestamp': timestamp, 'uploader': uploader, 'thumbnail': thumbnail, 'categories': categories, 'age_limit': 18, } ================================================ FILE: youtube_dl/extractor/pornovoisines.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( int_or_none, float_or_none, unified_strdate, ) class PornoVoisinesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)' _TEST = { 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', 'md5': '6f8aca6a058592ab49fe701c8ba8317b', 'info_dict': { 'id': '919', 'display_id': 'recherche-appartement', 'ext': 'mp4', 'title': 'Recherche appartement', 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140925', 'duration': 120, 'view_count': int, 'average_rating': float, 'categories': ['Débutante', 
'Débutantes', 'Scénario', 'Sodomie'], 'age_limit': 18, 'subtitles': { 'fr': [{ 'ext': 'vtt', }] }, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') settings_url = self._download_json( 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, video_id, note='Getting settings URL')['video_settings_url'] settings = self._download_json(settings_url, video_id)['data'] formats = [] for kind, data in settings['variants'].items(): if kind == 'HLS': formats.extend(self._extract_m3u8_formats( data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) elif kind == 'MP4': for item in data: formats.append({ 'url': item['url'], 'height': item.get('height'), 'bitrate': item.get('bitrate'), }) self._sort_formats(formats) webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) description = self._og_search_description(webpage) # The webpage has a bug - there's no space between "thumb" and src= thumbnail = self._html_search_regex( r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2', webpage, 'thumbnail', fatal=False, group='url') upload_date = unified_strdate(self._search_regex( r'Le\s*<b>([\d/]+)', webpage, 'upload date', fatal=False)) duration = settings.get('main', {}).get('duration') view_count = int_or_none(self._search_regex( r'(\d+) vues', webpage, 'view count', fatal=False)) average_rating = self._search_regex( r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False) if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) categories = self._html_search_regex( r'(?s)Catégories\s*:\s*<b>(.+?)</b>', webpage, 'categories', fatal=False) if categories: categories = [category.strip() for category in categories.split(',')] subtitles = {'fr': [{ 'url': subtitle, } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} return { 'id': video_id, 'display_id': 
display_id, 'formats': formats, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, 'average_rating': average_rating, 'categories': categories, 'age_limit': 18, 'subtitles': subtitles, } ================================================ FILE: youtube_dl/extractor/pornoxo.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( str_to_int, ) class PornoXOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html' _TEST = { 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', 'md5': '582f28ecbaa9e6e24cb90f50f524ce87', 'info_dict': { 'id': '7564', 'ext': 'flv', 'title': 'Striptease From Sexy Secretary!', 'display_id': 'striptease-from-sexy-secretary', 'description': 'md5:0ee35252b685b3883f4a1d38332f9980', 'categories': list, # NSFW 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id, display_id = mobj.groups() webpage = self._download_webpage(url, video_id) video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False) title = self._html_search_regex( r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title') view_count = str_to_int(self._html_search_regex( r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False)) categories_str = self._html_search_regex( r'<meta name="description" content=".*featuring\s*([^"]+)"', webpage, 'categories', fatal=False) categories = ( None if categories_str is None else categories_str.split(',')) video_data.update({ 'id': video_id, 'title': title, 'display_id': display_id, 'description': self._html_search_meta('description', webpage), 'categories': categories, 'view_count': view_count, 'age_limit': 18, }) return video_data ================================================ 
class Pr0grammStaticIE(InfoExtractor):
    """Extractor for the static (JS-free) pr0gramm.com pages.

    Possible urls:
    - https://pr0gramm.com/static/5466437
    """
    _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://pr0gramm.com/static/5466437',
        'md5': '52fa540d70d3edc286846f8ca85938aa',
        'info_dict': {
            'id': '5466437',
            'ext': 'mp4',
            'title': 'pr0gramm-5466437 by g11st',
            'uploader': 'g11st',
            'upload_date': '20221221',
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Fetch media sources from the <video> element on the static page
        entries = self._parse_html5_media_entries(url, webpage, video_id)
        media_info = entries[0]

        # this raises if there are no formats
        self._sort_formats(media_info.get('formats') or [])

        # Fetch author
        uploader = self._html_search_regex(
            r'by\W+([\w-]+)\W+', webpage, 'uploader')

        # Approximate the upload date from the media file path, e.g.
        # //img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4
        # Up to a day of accuracy should suffice...
        upload_date = None
        m = re.search(
            r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}',
            webpage)
        if m:
            upload_date = m.group('year') + m.group('mon') + m.group('day')

        return merge_dicts({
            'id': video_id,
            'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
            'uploader': uploader,
            'upload_date': upload_date,
        }, media_info)
# This extractor is for the primary url (used for sharing, and appears in the
# location bar). Since this page loads the DOM via JS, yt-dl can't find any
# video information here. So let's redirect to a compatibility version of
# the site, which does contain the <video>-element by itself, without requiring
# js to be ran.
class Pr0grammIE(InfoExtractor):
    # Possible urls:
    # https://pr0gramm.com/new/546637
    # https://pr0gramm.com/new/video/546637
    # https://pr0gramm.com/top/546637
    # https://pr0gramm.com/top/video/546637
    # https://pr0gramm.com/user/g11st/uploads/5466437
    # https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
    # https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
    # https://pr0gramm.com/user/froschler/1elf/5232030
    # https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
    # https://pr0gramm.com/top/fruher war alles damals/5498175
    _VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
    _TEST = {
        'url': 'https://pr0gramm.com/new/video/5466437',
        'info_dict': {
            'id': '5466437',
            'ext': 'mp4',
            'title': 'pr0gramm-5466437 by g11st',
            'uploader': 'g11st',
            'upload_date': '20221221',
        }
    }

    # BUG FIX: this was declared as a plain method without `self` and without
    # @staticmethod, so calling it on an instance would raise TypeError.
    @staticmethod
    def _generic_title():
        return "oof"

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # Redirect to the static page, which the static extractor can parse.
        return self.url_result(
            'https://pr0gramm.com/static/' + video_id,
            video_id=video_id,
            ie=Pr0grammStaticIE.ie_key())
class PressTVIE(InfoExtractor):
    """Extractor for presstv.ir detail pages."""
    _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?'

    _TEST = {
        'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/',
        'md5': '5d7e3195a447cb13e9267e931d8dd5a5',
        'info_dict': {
            'id': '459911',
            'display_id': 'Australian-sewerage-treatment-facility-',
            'ext': 'mp4',
            'title': 'Organic mattresses used to clean waste water',
            'upload_date': '20160409',
            'thumbnail': r're:^https?://.*\.jpg',
            'description': 'md5:20002e654bbafb6908395a5c0cfcd125'
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id') or video_id

        webpage = self._download_webpage(url, display_id)

        # extract video URL from webpage
        video_url = self._hidden_inputs(webpage)['inpPlayback']

        # build list of available formats
        # specified in http://www.presstv.ir/Scripts/playback.js
        base_url = 'http://192.99.219.222:82/presstv'
        formats = []
        for height, extension in (
                (180, '_low200.mp4'),
                (360, '_low400.mp4'),
                (720, '_low800.mp4'),
                (1080, '.mp4')):
            formats.append({
                'url': base_url + video_url[:-4] + extension,
                'format_id': '%dp' % height,
                'height': height,
            })

        # extract video metadata
        title = remove_start(
            self._html_search_meta('title', webpage, fatal=True), 'PressTV-')

        # The upload date is encoded in the URL path (y/m/d groups).
        upload_date = '%04d%02d%02d' % (
            int(mobj.group('y')),
            int(mobj.group('m')),
            int(mobj.group('d')),
        )

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'thumbnail': self._og_search_thumbnail(webpage),
            'upload_date': upload_date,
            'description': self._og_search_description(webpage),
        }
class ProSiebenSat1BaseIE(InfoExtractor):
    """Shared format-extraction logic for ProSiebenSat.1 sites.

    Subclasses provide the site credentials (_TOKEN, _SALT, _CLIENT_NAME and,
    for the v4 API, _ACCESS_ID/_ENCRYPTION_KEY/_IV).
    """
    _GEO_BYPASS = False
    # Set by subclasses that can use the v4 API; None disables that path.
    _ACCESS_ID = None
    _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear'
    _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get'

    def _extract_video_info(self, url, clip_id):
        """Return a partial info dict ('duration', 'formats') for clip_id.

        Tries the v4 API first (when _ACCESS_ID is set); falls back to the
        legacy vas.sim-technik.de sources API when no formats were found.
        Raises ExtractorError for DRM-protected or unavailable videos.
        """
        client_location = url
        video = self._download_json(
            'http://vas.sim-technik.de/vas/live/v2/videos',
            clip_id, 'Downloading videos JSON', query={
                'access_token': self._TOKEN,
                'client_location': client_location,
                'client_name': self._CLIENT_NAME,
                'ids': clip_id,
            })[0]
        if video.get('is_protected') is True:
            raise ExtractorError('This video is DRM protected.', expected=True)
        formats = []
        if self._ACCESS_ID:
            # v4 API: the client token is a SHA1 over key material + clip id.
            raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID
            protocols = self._download_json(
                self._V4_BASE_URL + 'protocols', clip_id,
                'Downloading protocols JSON',
                headers=self.geo_verification_headers(), query={
                    'access_id': self._ACCESS_ID,
                    'client_token': sha1((raw_ct).encode()).hexdigest(),
                    'video_id': clip_id,
                # 403 is expected for geo-blocked clips; inspect the payload.
                }, fatal=False, expected_status=(403,)) or {}
            error = protocols.get('error') or {}
            if error.get('title') == 'Geo check failed':
                self.raise_geo_restricted(countries=['AT', 'CH', 'DE'])
            server_token = protocols.get('server_token')
            if server_token:
                urls = (self._download_json(
                    self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={
                        'access_id': self._ACCESS_ID,
                        'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(),
                        'protocols': self._SUPPORTED_PROTOCOLS,
                        'server_token': server_token,
                        'video_id': clip_id,
                    }, fatal=False) or {}).get('urls') or {}
                for protocol, variant in urls.items():
                    source_url = variant.get('clear', {}).get('url')
                    if not source_url:
                        continue
                    if protocol == 'dash':
                        formats.extend(self._extract_mpd_formats(
                            source_url, clip_id, mpd_id=protocol, fatal=False))
                    elif protocol == 'hls':
                        formats.extend(self._extract_m3u8_formats(
                            source_url, clip_id, 'mp4', 'm3u8_native',
                            m3u8_id=protocol, fatal=False))
                    else:
                        formats.append({
                            'url': source_url,
                            'format_id': protocol,
                        })
        if not formats:
            # Legacy API fallback: enumerate sources and resolve each one.
            source_ids = [compat_str(source['id']) for source in video['sources']]

            # client_id is the first two salt chars + SHA1 of a fixed field order.
            client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()

            sources = self._download_json(
                'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id,
                clip_id, 'Downloading sources JSON', query={
                    'access_token': self._TOKEN,
                    'client_id': client_id,
                    'client_location': client_location,
                    'client_name': self._CLIENT_NAME,
                })
            server_id = sources['server_id']

            def fix_bitrate(bitrate):
                # Some sources report bit/s, others kbit/s; normalize the
                # former (exact multiples of 1000) down to kbit/s.
                bitrate = int_or_none(bitrate)
                if not bitrate:
                    return None
                return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate

            for source_id in source_ids:
                # Note: the per-source client_id uses a different field order
                # than the one above (includes server_id and source_id).
                client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
                urls = self._download_json(
                    'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id,
                    clip_id, 'Downloading urls JSON', fatal=False, query={
                        'access_token': self._TOKEN,
                        'client_id': client_id,
                        'client_location': client_location,
                        'client_name': self._CLIENT_NAME,
                        'server_id': server_id,
                        'source_ids': source_id,
                    })
                if not urls:
                    continue
                if urls.get('status_code') != 0:
                    raise ExtractorError('This video is unavailable', expected=True)
                urls_sources = urls['sources']
                if isinstance(urls_sources, dict):
                    urls_sources = urls_sources.values()
                for source in urls_sources:
                    source_url = source.get('url')
                    if not source_url:
                        continue
                    protocol = source.get('protocol')
                    mimetype = source.get('mimetype')
                    # Dispatch on mimetype/URL shape: HDS, HLS, DASH, then
                    # RTMP or plain progressive HTTP as the fallback.
                    if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            source_url, clip_id, f4m_id='hds', fatal=False))
                    elif mimetype == 'application/x-mpegURL':
                        formats.extend(self._extract_m3u8_formats(
                            source_url, clip_id, 'mp4', 'm3u8_native',
                            m3u8_id='hls', fatal=False))
                    elif mimetype == 'application/dash+xml':
                        formats.extend(self._extract_mpd_formats(
                            source_url, clip_id, mpd_id='dash', fatal=False))
                    else:
                        tbr = fix_bitrate(source['bitrate'])

                        if protocol in ('rtmp', 'rtmpe'):
                            # Split an rtmp URL into app and play_path at the
                            # 'mp4:' marker, as rtmpdump expects.
                            mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
                            if not mobj:
                                continue
                            path = mobj.group('path')
                            mp4colon_index = path.rfind('mp4:')
                            app = path[:mp4colon_index]
                            play_path = path[mp4colon_index:]
                            formats.append({
                                'url': '%s/%s' % (mobj.group('url'), app),
                                'app': app,
                                'play_path': play_path,
                                'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
                                'page_url': 'http://www.prosieben.de',
                                'tbr': tbr,
                                'ext': 'flv',
                                'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''),
                            })
                        else:
                            formats.append({
                                'url': source_url,
                                'tbr': tbr,
                                'format_id': 'http%s' % ('-%d' % tbr if tbr else ''),
                            })
        self._sort_formats(formats)

        return {
            'duration': float_or_none(video.get('duration')),
            'formats': formats,
        }
class ProSiebenSat1IE(ProSiebenSat1BaseIE):
    """Extractor for ProSiebenSat.1 Digital sites (prosieben, sixx, sat1, ...).

    Pages are either single clips or playlists; _real_extract dispatches on
    the page type found in the markup.
    """
    IE_NAME = 'prosiebensat1'
    IE_DESC = 'ProSiebenSat.1 Digital'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?
                        (?:
                            (?:beta\.)?
                            (?:
                                prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia
                            )\.(?:de|at|ch)|
                            ran\.de|fem\.com|advopedia\.de|galileo\.tv/video
                        )
                        /(?P<id>.+)
                    '''

    _TESTS = [
        {
            # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242
            # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215:
            # - malformed f4m manifest support
            # - proper handling of URLs starting with `https?://` in 2.0 manifests
            # - recursive child f4m manifests extraction
            'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
            'info_dict': {
                'id': '2104602',
                'ext': 'mp4',
                'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2',
                'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
                'upload_date': '20131231',
                'duration': 5845.04,
                'series': 'CIRCUS HALLIGALLI',
                'season_number': 2,
                'episode': 'Episode 18 - Staffel 2',
                'episode_number': 18,
            },
        },
        {
            'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html',
            'info_dict': {
                'id': '2570327',
                'ext': 'mp4',
                'title': 'Lady-Umstyling für Audrina',
                'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d',
                'upload_date': '20131014',
                'duration': 606.76,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'Seems to be broken',
        },
        {
            'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
            'info_dict': {
                'id': '2429369',
                'ext': 'mp4',
                'title': 'Countdown für die Autowerkstatt',
                'description': 'md5:809fc051a457b5d8666013bc40698817',
                'upload_date': '20140223',
                'duration': 2595.04,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'This video is unavailable',
        },
        {
            'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
            'info_dict': {
                'id': '2904997',
                'ext': 'mp4',
                'title': 'Sexy laufen in Ugg Boots',
                'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6',
                'upload_date': '20140122',
                'duration': 245.32,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'This video is unavailable',
        },
        {
            'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
            'info_dict': {
                'id': '2906572',
                'ext': 'mp4',
                'title': 'Im Interview: Kai Wiesinger',
                'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
                'upload_date': '20140203',
                'duration': 522.56,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'This video is unavailable',
        },
        {
            'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
            'info_dict': {
                'id': '2992323',
                'ext': 'mp4',
                'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
                'description': 'md5:2669cde3febe9bce13904f701e774eb6',
                'upload_date': '20141014',
                'duration': 2410.44,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'This video is unavailable',
        },
        {
            'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
            'info_dict': {
                'id': '3004256',
                'ext': 'mp4',
                'title': 'Schalke: Tönnies möchte Raul zurück',
                'description': 'md5:4b5b271d9bcde223b54390754c8ece3f',
                'upload_date': '20140226',
                'duration': 228.96,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
            'skip': 'This video is unavailable',
        },
        {
            'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
            'info_dict': {
                'id': '2572814',
                'ext': 'mp4',
                'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man',
                'description': 'md5:6ddb02b0781c6adf778afea606652e38',
                'timestamp': 1382041620,
                'upload_date': '20131017',
                'duration': 469.88,
            },
            'params': {
                'skip_download': True,
            },
        },
        {
            'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag',
            'info_dict': {
                'id': '2156342',
                'ext': 'mp4',
                'title': 'Kurztrips zum Valentinstag',
                'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.',
                'duration': 307.24,
            },
            'params': {
                'skip_download': True,
            },
        },
        {
            'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist',
            'info_dict': {
                'id': '439664',
                'title': 'Episode 8 - Ganze Folge - Playlist',
                'description': 'md5:63b8963e71f481782aeea877658dec84',
            },
            'playlist_count': 2,
            'skip': 'This video is unavailable',
        },
        {
            # title in <h2 class="subtitle">
            'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip',
            'info_dict': {
                'id': '4895826',
                'ext': 'mp4',
                'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe',
                'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9',
                'upload_date': '20170302',
            },
            'params': {
                'skip_download': True,
            },
            'skip': 'geo restricted to Germany',
        },
        {
            # geo restricted to Germany
            'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge',
            'only_matching': True,
        },
        {
            # geo restricted to Germany
            'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge',
            'only_matching': True,
        },
        {
            # geo restricted to Germany
            'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden',
            'only_matching': True,
        },
        {
            'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel',
            'only_matching': True,
        },
        {
            'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage',
            'only_matching': True,
        },
    ]

    _TOKEN = 'prosieben'
    _SALT = '01!8d8F_)r9]4s[qeuXfP%'
    _CLIENT_NAME = 'kolibri-2.0.19-splec4'

    _ACCESS_ID = 'x_prosiebenmaxx-de'
    _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag'
    _IV = 'Aeluchoc6aevechuipiexeeboowedaok'

    # Patterns are tried in order; the first one that matches wins.
    # BUG FIX: the proMamsId pattern was listed twice; the duplicate entry
    # was redundant and has been removed.
    _CLIPID_REGEXES = [
        r'"clip_id"\s*:\s+"(\d+)"',
        r'clipid: "(\d+)"',
        r'clip[iI]d=(\d+)',
        r'clip[iI][dD]\s*=\s*["\'](\d+)',
        r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)",
        r'proMamsId"\s*:\s*"(\d+)',
    ]
    _TITLE_REGEXES = [
        r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
        r'<header class="clearfix">\s*<h3>(.+?)</h3>',
        r'<!-- start video -->\s*<h1>(.+?)</h1>',
        r'<h1 class="att-name">\s*(.+?)</h1>',
        r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
        r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>',
        r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>',
        r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>',
    ]
    _DESCRIPTION_REGEXES = [
        r'<p itemprop="description">\s*(.+?)</p>',
        r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
        r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
        r'<p class="att-description">\s*(.+?)\s*</p>',
        r'<p class="video-description" itemprop="description">\s*(.+?)</p>',
        r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>',
    ]
    _UPLOAD_DATE_REGEXES = [
        r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"',
        r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr',
        r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
        r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
    ]
    _PAGE_TYPE_REGEXES = [
        r'<meta name="page_type" content="([^"]+)">',
        r"'itemType'\s*:\s*'([^']*)'",
    ]
    _PLAYLIST_ID_REGEXES = [
        r'content[iI]d=(\d+)',
        r"'itemId'\s*:\s*'([^']*)'",
    ]
    _PLAYLIST_CLIP_REGEXES = [
        r'(?s)data-qvt=.+?<a href="([^"]+)"',
    ]

    def _extract_clip(self, url, webpage):
        """Extract a single clip page; merges API info, page metadata and JSON-LD."""
        clip_id = self._html_search_regex(
            self._CLIPID_REGEXES, webpage, 'clip id')
        title = self._html_search_regex(
            self._TITLE_REGEXES, webpage, 'title',
            default=None) or self._og_search_title(webpage)
        info = self._extract_video_info(url, clip_id)
        description = self._html_search_regex(
            self._DESCRIPTION_REGEXES, webpage, 'description', default=None)
        if description is None:
            description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)
        upload_date = unified_strdate(
            self._html_search_meta('og:published_time', webpage,
                                   'upload date', default=None)
            or self._html_search_regex(self._UPLOAD_DATE_REGEXES,
                                       webpage, 'upload date', default=None))

        json_ld = self._search_json_ld(webpage, clip_id, default={})

        return merge_dicts(info, {
            'id': clip_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }, json_ld)

    def _extract_playlist(self, url, webpage):
        """Extract a playlist page from the inline contentResources JSON."""
        playlist_id = self._html_search_regex(
            self._PLAYLIST_ID_REGEXES, webpage, 'playlist id')
        playlist = self._parse_json(
            self._search_regex(
                r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script',
                webpage, 'playlist'),
            playlist_id)
        entries = []
        for item in playlist:
            clip_id = item.get('id') or item.get('upc')
            if not clip_id:
                continue
            info = self._extract_video_info(url, clip_id)
            info.update({
                'id': clip_id,
                'title': item.get('title') or item.get('teaser', {}).get('headline'),
                'description': item.get('teaser', {}).get('description'),
                'thumbnail': item.get('poster'),
                'duration': float_or_none(item.get('duration')),
                'series': item.get('tvShowTitle'),
                'uploader': item.get('broadcastPublisher'),
            })
            entries.append(info)
        return self.playlist_result(entries, playlist_id)

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        # Dispatch on the page type declared in the markup; default to clip.
        page_type = self._search_regex(
            self._PAGE_TYPE_REGEXES, webpage,
            'page type', default='clip').lower()
        if page_type == 'clip':
            return self._extract_clip(url, webpage)
        elif page_type == 'playlist':
            return self._extract_playlist(url, webpage)
        else:
            raise ExtractorError(
                'Unsupported page type %s' % page_type, expected=True)
class PuhuTVIE(InfoExtractor):
    """Extractor for single puhutv.com watch pages (…-izle URLs)."""
    _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle'
    IE_NAME = 'puhutv'
    _TESTS = [{
        # film
        'url': 'https://puhutv.com/sut-kardesler-izle',
        'md5': 'a347470371d56e1585d1b2c8dab01c96',
        'info_dict': {
            'id': '5085',
            'display_id': 'sut-kardesler',
            'ext': 'mp4',
            'title': 'Süt Kardeşler',
            'description': 'md5:ca09da25b7e57cbb5a9280d6e48d17aa',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 4832.44,
            'creator': 'Arzu Film',
            'timestamp': 1561062602,
            'upload_date': '20190620',
            'release_year': 1976,
            'view_count': int,
            'tags': list,
        },
    }, {
        # episode, geo restricted, bypassable with --geo-verification-proxy
        'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle',
        'only_matching': True,
    }, {
        # 4k, with subtitles
        'url': 'https://puhutv.com/dip-1-bolum-izle',
        'only_matching': True,
    }]
    # Maps the site's human-readable subtitle language names to ISO codes.
    _SUBTITLE_LANGS = {
        'English': 'en',
        'Deutsch': 'de',
        'عربى': 'ar'
    }

    def _real_extract(self, url):
        display_id = self._match_id(url)

        # Slug API delivers the main metadata record for the page.
        info = self._download_json(
            urljoin(url, '/api/slug/%s-izle' % display_id),
            display_id)['data']

        video_id = compat_str(info['id'])
        # NOTE: 'title' here is the parent show object, not a string.
        show = info.get('title') or {}
        title = info.get('name') or show['name']
        if info.get('display_name'):
            title = '%s %s' % (title, info['display_name'])

        try:
            videos = self._download_json(
                'https://puhutv.com/api/assets/%s/videos' % video_id,
                display_id, 'Downloading video JSON',
                headers=self.geo_verification_headers())
        except ExtractorError as e:
            # A 403 on the assets endpoint indicates geo restriction.
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                self.raise_geo_restricted()
            raise

        urls = []  # deduplicates media URLs across entries
        formats = []

        for video in videos['data']['videos']:
            media_url = url_or_none(video.get('url'))
            if not media_url or media_url in urls:
                continue
            urls.append(media_url)

            playlist = video.get('is_playlist')
            # Master playlists are expanded into individual HLS formats.
            if (video.get('stream_type') == 'hls' and playlist is True) or 'playlist.m3u8' in media_url:
                formats.extend(self._extract_m3u8_formats(
                    media_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                continue

            quality = int_or_none(video.get('quality'))
            f = {
                'url': media_url,
                'ext': 'mp4',
                'height': quality
            }
            video_format = video.get('video_format')
            # A single-rendition HLS URL (not a master playlist) is kept as
            # one format with the native HLS downloader.
            is_hls = (video_format == 'hls' or '/hls/' in media_url or '/chunklist.m3u8' in media_url) and playlist is False
            if is_hls:
                format_id = 'hls'
                f['protocol'] = 'm3u8_native'
            elif video_format == 'mp4':
                format_id = 'http'
            else:
                # Unknown format type: skip rather than guess.
                continue
            if quality:
                format_id += '-%sp' % quality
            f['format_id'] = format_id
            formats.append(f)
        self._sort_formats(formats)

        creator = try_get(
            show, lambda x: x['producer']['name'], compat_str)

        content = info.get('content') or {}

        # Thumbnail ids encode the resolution (parsed via parse_resolution).
        images = try_get(
            content, lambda x: x['images']['wide'], dict) or {}
        thumbnails = []
        for image_id, image_url in images.items():
            if not isinstance(image_url, compat_str):
                continue
            if not image_url.startswith(('http', '//')):
                image_url = 'https://%s' % image_url
            t = parse_resolution(image_id)
            t.update({
                'id': image_id,
                'url': image_url
            })
            thumbnails.append(t)

        tags = []
        for genre in show.get('genres') or []:
            if not isinstance(genre, dict):
                continue
            genre_name = genre.get('name')
            if genre_name and isinstance(genre_name, compat_str):
                tags.append(genre_name)

        subtitles = {}
        for subtitle in content.get('subtitles') or []:
            if not isinstance(subtitle, dict):
                continue
            lang = subtitle.get('language')
            sub_url = url_or_none(subtitle.get('url') or subtitle.get('file'))
            if not lang or not isinstance(lang, compat_str) or not sub_url:
                continue
            subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{
                'url': sub_url
            }]

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': info.get('description') or show.get('description'),
            'season_id': str_or_none(info.get('season_id')),
            'season_number': int_or_none(info.get('season_number')),
            'episode_number': int_or_none(info.get('episode_number')),
            'release_year': int_or_none(show.get('released_at')),
            'timestamp': unified_timestamp(info.get('created_at')),
            'creator': creator,
            'view_count': int_or_none(content.get('watch_count')),
            'duration': float_or_none(content.get('duration_in_ms'), 1000),
            'tags': tags,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
            'formats': formats
        }
class PuhuTVSerieIE(InfoExtractor):
    """Extractor for puhutv.com series detail pages (…-detay URLs)."""
    _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay'
    IE_NAME = 'puhutv:serie'
    _TESTS = [{
        'url': 'https://puhutv.com/deniz-yildizi-detay',
        'info_dict': {
            'title': 'Deniz Yıldızı',
            'id': 'deniz-yildizi',
        },
        'playlist_mincount': 205,
    }, {
        # a film detail page which is using same url with serie page
        'url': 'https://puhutv.com/kaybedenler-kulubu-detay',
        'only_matching': True,
    }]

    def _extract_entries(self, seasons):
        """Yield url_result entries for every episode of every season."""
        for season in seasons:
            season_id = season.get('id')
            if not season_id:
                continue
            page = 1
            while True:
                season_data = self._download_json(
                    'https://galadriel.puhutv.com/seasons/%s' % season_id,
                    season_id, 'Downloading page %s' % page, query={
                        'page': page,
                        'per': 40,
                    })
                episodes = season_data.get('episodes')
                if isinstance(episodes, list):
                    for ep in episodes:
                        slug_path = str_or_none(ep.get('slugPath'))
                        if not slug_path:
                            continue
                        video_id = str_or_none(int_or_none(ep.get('id')))
                        yield self.url_result(
                            'https://puhutv.com/%s' % slug_path,
                            ie=PuhuTVIE.ie_key(), video_id=video_id,
                            video_title=ep.get('name') or ep.get('eventLabel'))
                page += 1
                # Continue paging only while the API explicitly says so.
                if season_data.get('hasMore') is not True:
                    break

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        data = self._download_json(
            urljoin(url, '/api/slug/%s-detay' % playlist_id),
            playlist_id)['data']

        seasons = data.get('seasons')
        if not seasons:
            # For films, these are using same url with series
            film_id = data.get('slug') or data['assets'][0]['slug']
            return self.url_result(
                'https://puhutv.com/%s-izle' % film_id,
                PuhuTVIE.ie_key(), film_id)

        return self.playlist_result(
            self._extract_entries(seasons), playlist_id, data.get('name'))
class Puls4IE(ProSiebenSat1BaseIE):
    """Extractor for puls4.com; reuses the ProSiebenSat.1 format extraction."""
    _VALID_URL = r'https?://(?:www\.)?puls4\.com/(?P<id>[^?#&]+)'
    _TESTS = [{
        'url': 'http://www.puls4.com/2-minuten-2-millionen/staffel-3/videos/2min2miotalk/Tobias-Homberger-von-myclubs-im-2min2miotalk-118118',
        'md5': 'fd3c6b0903ac72c9d004f04bc6bb3e03',
        'info_dict': {
            'id': '118118',
            'ext': 'flv',
            'title': 'Tobias Homberger von myclubs im #2min2miotalk',
            'description': 'md5:f9def7c5e8745d6026d8885487d91955',
            'upload_date': '20160830',
            'uploader': 'PULS_4',
        },
    }, {
        'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident.-Norbert-Hofer',
        'only_matching': True,
    }, {
        'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident-Analyse-des-Interviews-mit-Norbert-Hofer-416598',
        'only_matching': True,
    }]
    _TOKEN = 'puls4'
    _SALT = '01!kaNgaiNgah1Ie4AeSha'
    _CLIENT_NAME = ''

    def _real_extract(self, url):
        path = self._match_id(url)

        # Resolve the page path to the content record, then fetch the media.
        page = self._download_json(
            'http://www.puls4.com/api/json-fe/page/' + path, path)
        content_path = page['content'][0]['url']
        media = self._download_json(
            'http://www.puls4.com' + content_path, content_path)['mediaCurrent']
        player = media['playerContent']

        # Base info (duration/formats) comes from the shared P7S1 API.
        info = self._extract_video_info(url, player['id'])
        info.update({
            'id': compat_str(media['objectId']),
            'title': player['title'],
            'description': media.get('description'),
            'thumbnail': media.get('previewLink'),
            'upload_date': unified_strdate(media.get('date')),
            'duration': parse_duration(player.get('duration')),
            'episode': player.get('episodePartName'),
            'show': media.get('channel'),
            'season_id': player.get('seasonId'),
            'uploader': player.get('sourceCompany'),
        })
        return info
class PyvideoIE(InfoExtractor):
    """Extractor for pyvideo.org; prefers the JSON data repository on GitHub
    and falls back to scraping the HTML page."""
    _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)'

    _TESTS = [{
        'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html',
        'info_dict': {
            'id': 'become-a-logging-expert-in-30-minutes',
        },
        'playlist_count': 2,
    }, {
        'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html',
        'md5': '5fe1c7e0a8aa5570330784c847ff6d12',
        'info_dict': {
            'id': '2542',
            'ext': 'm4v',
            'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v',
        },
    }]

    def _real_extract(self, url):
        category, video_id = re.match(self._VALID_URL, url).groups()

        entries = []

        # Primary source: the pyvideo data repository on GitHub.
        data = self._download_json(
            'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json'
            % (category, video_id), video_id, fatal=False)

        if data:
            for video in data['videos']:
                video_url = video.get('url')
                if not video_url:
                    continue
                if video.get('type') == 'youtube':
                    entries.append(self.url_result(video_url, 'Youtube'))
                    continue
                entries.append({
                    'id': compat_str(data.get('id') or video_id),
                    'url': video_url,
                    'title': data['title'],
                    'description': data.get('description') or data.get('summary'),
                    'thumbnail': data.get('thumbnail_url'),
                    'duration': int_or_none(data.get('duration')),
                })
        else:
            # Fallback: scrape the "Media URL" links from the page itself.
            webpage = self._download_webpage(url, video_id)
            title = self._og_search_title(webpage)
            media_urls = self._search_regex(
                r'(?s)Media URL:(.+?)</li>', webpage, 'media urls')
            for m in re.finditer(
                    r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls):
                media_url = m.group('url')
                if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url):
                    entries.append(self.url_result(media_url, 'Youtube'))
                else:
                    entries.append({
                        'id': video_id,
                        'url': media_url,
                        'title': title,
                    })

        return self.playlist_result(entries, video_id)
class QQMusicIE(InfoExtractor):
    """Extractor for single songs on y.qq.com."""
    IE_NAME = 'qqmusic'
    IE_DESC = 'QQ音乐'
    _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html'
    _TESTS = [{
        'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html',
        'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8',
        'info_dict': {
            'id': '004295Et37taLD',
            'ext': 'mp3',
            'title': '可惜没如果',
            'release_date': '20141227',
            'creator': '林俊杰',
            'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'note': 'There is no mp3-320 version of this song.',
        'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html',
        'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
        'info_dict': {
            'id': '004MsGEo3DdNxV',
            'ext': 'mp3',
            'title': '如果',
            'release_date': '20050626',
            'creator': '李季美',
            'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'note': 'lyrics not in .lrc format',
        'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html',
        'info_dict': {
            'id': '001JyApY11tIp6',
            'ext': 'mp3',
            'title': 'Shadows Over Transylvania',
            'release_date': '19970225',
            'creator': 'Dark Funeral',
            'description': 'md5:c9b20210587cbcd6836a1c597bab4525',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Known renditions; the prefix is part of the CDN file name.
    _FORMATS = {
        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
    }

    # Reference: m_r_GetRUin() in top_player.js
    # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
    @staticmethod
    def m_r_get_ruin():
        cur_ms = int(time.time() * 1000) % 1000
        return int(round(random.random() * 2147483647) * cur_ms % 1E10)

    def _real_extract(self, url):
        mid = self._match_id(url)

        detail_info_page = self._download_webpage(
            'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
            mid, note='Download song detail info',
            errnote='Unable to get song detail info', encoding='gbk')

        song_name = self._html_search_regex(
            r"songname:\s*'([^']+)'", detail_info_page, 'song name')

        publish_time = self._html_search_regex(
            r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page,
            'publish time', default=None)
        if publish_time:
            publish_time = publish_time.replace('-', '')

        singer = self._html_search_regex(
            r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None)

        lrc_content = self._html_search_regex(
            r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
            detail_info_page, 'LRC lyrics', default=None)
        if lrc_content:
            lrc_content = lrc_content.replace('\\n', '\n')

        thumbnail_url = None
        albummid = self._search_regex(
            [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
            detail_info_page, 'album mid', default=None)
        if albummid:
            thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \
                            % (albummid[-2:-1], albummid[-1], albummid)

        guid = self.m_r_get_ruin()

        vkey = self._download_json(
            'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
            mid, note='Retrieve vkey', errnote='Unable to get vkey',
            transform_source=strip_jsonp)['key']

        formats = []
        for format_id, details in self._FORMATS.items():
            formats.append({
                'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
                       % (details['prefix'], mid, details['ext'], vkey, guid),
                'format': format_id,
                'format_id': format_id,
                'preference': details['preference'],
                'abr': details.get('abr'),
            })
        self._check_formats(formats, mid)
        self._sort_formats(formats)

        # BUG FIX: lrc_content is None for songs without a lyrics block;
        # re.findall(..., None) would raise TypeError, so guard first.
        actual_lrc_lyrics = ''
        if lrc_content:
            # Keep only properly timestamped/tagged LRC lines.
            actual_lrc_lyrics = ''.join(
                line + '\n' for line in re.findall(
                    r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])',
                    lrc_content))

        info_dict = {
            'id': mid,
            'formats': formats,
            'title': song_name,
            'release_date': publish_time,
            'creator': singer,
            'description': lrc_content,
            'thumbnail': thumbnail_url,
        }
        if actual_lrc_lyrics:
            info_dict['subtitles'] = {
                'origin': [{
                    'ext': 'lrc',
                    'data': actual_lrc_lyrics,
                }]
            }
        return info_dict
info_dict = { 'id': mid, 'formats': formats, 'title': song_name, 'release_date': publish_time, 'creator': singer, 'description': lrc_content, 'thumbnail': thumbnail_url } if actual_lrc_lyrics: info_dict['subtitles'] = { 'origin': [{ 'ext': 'lrc', 'data': actual_lrc_lyrics, }] } return info_dict class QQPlaylistBaseIE(InfoExtractor): @staticmethod def qq_static_url(category, mid): return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) def get_singer_all_songs(self, singmid, num): return self._download_webpage( r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, query={ 'format': 'json', 'inCharset': 'utf8', 'outCharset': 'utf-8', 'platform': 'yqq', 'needNewCode': 0, 'singermid': singmid, 'order': 'listen', 'begin': 0, 'num': num, 'songstatus': 1, }) def get_entries_from_page(self, singmid): entries = [] default_num = 1 json_text = self.get_singer_all_songs(singmid, default_num) json_obj_all_songs = self._parse_json(json_text, singmid) if json_obj_all_songs['code'] == 0: total = json_obj_all_songs['data']['total'] json_text = self.get_singer_all_songs(singmid, total) json_obj_all_songs = self._parse_json(json_text, singmid) for item in json_obj_all_songs['data']['list']: if item['musicData'].get('songmid') is not None: songmid = item['musicData']['songmid'] entries.append(self.url_result( r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid)) return entries class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html' _TEST = { 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html', 'info_dict': { 'id': '001BLpXF2DyJe2', 'title': '林俊杰', 'description': 'md5:870ec08f7d8547c29c93010899103751', }, 'playlist_mincount': 12, } def _real_extract(self, url): mid = self._match_id(url) entries = self.get_entries_from_page(mid) singer_page = self._download_webpage(url, mid, 'Download singer page') 
singer_name = self._html_search_regex( r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None) singer_desc = None if mid: singer_desc_page = self._download_xml( 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, 'Donwload singer description XML', query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, headers={'Referer': 'https://y.qq.com/n/yqq/singer/'}) singer_desc = singer_desc_page.find('./data/info/desc').text return self.playlist_result(entries, mid, singer_name, singer_desc) class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html' _TESTS = [{ 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', 'info_dict': { 'id': '000gXCTb2AhRR1', 'title': '我们都是这样长大的', 'description': 'md5:179c5dce203a5931970d306aa9607ea6', }, 'playlist_count': 4, }, { 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', 'info_dict': { 'id': '002Y5a3b3AlCu3', 'title': '그리고...', 'description': 'md5:a48823755615508a95080e81b51ba729', }, 'playlist_count': 8, }] def _real_extract(self, url): mid = self._match_id(url) album = self._download_json( 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid, mid, 'Download album page')['data'] entries = [ self.url_result( 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] ) for song in album['list'] ] album_name = album.get('name') album_detail = album.get('desc') if album_detail is not None: album_detail = album_detail.strip() return self.playlist_result(entries, mid, album_name, album_detail) class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html' _TESTS = [{ 'url': 'https://y.qq.com/n/yqq/toplist/123.html', 'info_dict': { 'id': '123', 'title': '美国iTunes榜', 'description': 
'md5:89db2335fdbb10678dee2d43fe9aba08', }, 'playlist_count': 100, }, { 'url': 'https://y.qq.com/n/yqq/toplist/3.html', 'info_dict': { 'id': '3', 'title': '巅峰榜·欧美', 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', }, 'playlist_count': 100, }, { 'url': 'https://y.qq.com/n/yqq/toplist/106.html', 'info_dict': { 'id': '106', 'title': '韩国Mnet榜', 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', }, 'playlist_count': 50, }] def _real_extract(self, url): list_id = self._match_id(url) toplist_json = self._download_json( 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, note='Download toplist page', query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) entries = [self.url_result( 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', song['data']['songmid']) for song in toplist_json['songlist']] topinfo = toplist_json.get('topinfo', {}) list_name = topinfo.get('ListName') list_description = topinfo.get('info') return self.playlist_result(entries, list_id, list_name, list_description) class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' _TESTS = [{ 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', 'info_dict': { 'id': '3462654915', 'title': '韩国5月新歌精选下旬', 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', }, 'playlist_count': 40, 'skip': 'playlist gone', }, { 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', 'info_dict': { 'id': '1374105607', 'title': '易入人心的华语民谣', 'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。', }, 'playlist_count': 20, }] def _real_extract(self, url): list_id = self._match_id(url) list_json = self._download_json( 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', list_id, 'Download list page', query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, transform_source=strip_jsonp) if not 
len(list_json.get('cdlist', [])): if list_json.get('code'): raise ExtractorError( 'QQ Music said: error %d in fetching playlist info' % list_json['code'], expected=True) raise ExtractorError('Unable to get playlist info') cdlist = list_json['cdlist'][0] entries = [self.url_result( 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) for song in cdlist['songlist']] list_name = cdlist.get('dissname') list_description = clean_html(unescapeHTML(cdlist.get('desc'))) return self.playlist_result(entries, list_id, list_name, list_description) ================================================ FILE: youtube_dl/extractor/r7.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import int_or_none class R7IE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| noticias\.r7\.com(?:/[^/]+)+/[^/]+-| player\.r7\.com/video/i/ ) (?P<id>[\da-f]{24}) ''' _TESTS = [{ 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', 'md5': '403c4e393617e8e8ddc748978ee8efde', 'info_dict': { 'id': '54e7050b0cf2ff57e0279389', 'ext': 'mp4', 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', 'description': 'md5:01812008664be76a6479aa58ec865b72', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 98, 'like_count': int, 'view_count': int, }, }, { 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', 'only_matching': True, }, { 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', 'only_matching': True, }, { 'url': 
'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( 'http://player-api.r7.com/video/i/%s' % video_id, video_id) title = video['title'] formats = [] media_url_hls = video.get('media_url_hls') if media_url_hls: formats.extend(self._extract_m3u8_formats( media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) media_url = video.get('media_url') if media_url: f = { 'url': media_url, 'format_id': 'http', } # m3u8 format always matches the http format, let's copy metadata from # one to another m3u8_formats = list(filter( lambda f: f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == 1: f_copy = m3u8_formats[0].copy() f_copy.update(f) f_copy['protocol'] = 'http' f = f_copy formats.append(f) self._sort_formats(formats) description = video.get('description') thumbnail = video.get('thumb') duration = int_or_none(video.get('media_duration')) like_count = int_or_none(video.get('likes')) view_count = int_or_none(video.get('views')) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'like_count': like_count, 'view_count': view_count, 'formats': formats, } class R7ArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)' _TEST = { 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015', 
'only_matching': True, } @classmethod def suitable(cls, url): return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url) def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})', webpage, 'video id') return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key()) ================================================ FILE: youtube_dl/extractor/radiobremen.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import parse_duration class RadioBremenIE(InfoExtractor): _VALID_URL = r'http?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)' IE_NAME = 'radiobremen' _TEST = { 'url': 'http://www.radiobremen.de/mediathek/?id=141876', 'info_dict': { 'id': '141876', 'ext': 'mp4', 'duration': 178, 'width': 512, 'title': 'Druck auf Patrick Öztürk', 'thumbnail': r're:https?://.*\.jpg$', 'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. 
Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.',
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # Title/description/duration come from a separate metadata endpoint.
        meta_url = 'http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s' % video_id
        meta_doc = self._download_webpage(
            meta_url, video_id, 'Downloading metadata')
        title = self._html_search_regex(
            r'<h1.*>(?P<title>.+)</h1>', meta_doc, 'title')
        description = self._html_search_regex(
            r'<p>(?P<description>.*)</p>', meta_doc, 'description', fatal=False)
        duration = parse_duration(self._html_search_regex(
            r'Länge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>',
            meta_doc, 'duration', fatal=False))

        # The mp4 URL components (secret, width) are embedded in a JS player
        # call on the watch page itself.
        page_doc = self._download_webpage(
            url, video_id, 'Downloading video information')
        mobj = re.search(
            r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)",
            page_doc)
        # NOTE(review): mobj is used unguarded below; if the page layout
        # changes this raises AttributeError rather than a clean extractor
        # error — confirm whether that is acceptable.
        video_url = (
            "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" %
            (video_id, video_id,
             mobj.group("secret"), mobj.group('width')))

        formats = [{
            'url': video_url,
            'ext': 'mp4',
            'width': int(mobj.group('width')),
        }]
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'formats': formats,
            'thumbnail': mobj.group('thumbnail'),
        }


================================================
FILE: youtube_dl/extractor/radiocanada.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
    determine_ext,
    ExtractorError,
    int_or_none,
    unified_strdate,
)


class RadioCanadaIE(InfoExtractor):
    IE_NAME = 'radiocanada'
    # Matches both the public mediaconsole URLs and internal
    # 'radiocanada:<app_code>:<id>' references.
    _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
    _TESTS = [
        {
            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
            'info_dict': {
                'id': '7184272',
                'ext': 'mp4',
'title': 'Le parcours du tireur capté sur vidéo',
                'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
                'upload_date': '20141023',
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            }
        },
        {
            # empty Title
            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/',
            'info_dict': {
                'id': '7754998',
                'ext': 'mp4',
                'title': 'letelejournal22h',
                'description': 'INTEGRALE WEB 22H-TJ',
                'upload_date': '20170720',
            },
            'params': {
                # m3u8 download
                'skip_download': True,
            },
        },
        {
            # with protectionType but not actually DRM protected
            'url': 'radiocanada:toutv:140872',
            'info_dict': {
                'id': '140872',
                'title': 'Épisode 1',
                'series': 'District 31',
            },
            'only_matching': True,
        }
    ]
    _GEO_COUNTRIES = ['CA']
    # Class-level caches shared across requests of one extraction run.
    _access_token = None
    _claims = None

    def _call_api(self, path, video_id=None, app_code=None, query=None):
        # Wrapper around the services.radio-canada.ca media API: merges the
        # client key, optional media identification and cached access token
        # into the query, and surfaces API error payloads (401/422) as
        # expected ExtractorErrors.
        if not query:
            query = {}
        query.update({
            'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb',
            'output': 'json',
        })
        if video_id:
            query.update({
                'appCode': app_code,
                'idMedia': video_id,
            })
        if self._access_token:
            query['access_token'] = self._access_token
        try:
            return self._download_json(
                'https://services.radio-canada.ca/media/' + path, video_id, query=query)
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422):
                data = self._parse_json(e.cause.read().decode(), None)
                error = data.get('error_description') or data['errorMessage']['text']
                raise ExtractorError(error, expected=True)
            raise

    def _extract_info(self, app_code, video_id):
        metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas']

        def get_meta(name):
            # Return the first non-empty 'text' among metas with this name,
            # or None implicitly when absent.
            for meta in metas:
                if meta.get('name') == name:
                    text = meta.get('text')
                    if text:
                        return text

        # protectionType does not necessarily mean the video is DRM protected (see
        # https://github.com/ytdl-org/youtube-dl/pull/18609).
if get_meta('protectionType'): self.report_warning('This video is probably DRM protected.') query = { 'connectionType': 'hd', 'deviceType': 'ipad', 'multibitrate': 'true', } if self._claims: query['claims'] = self._claims v_data = self._call_api('validation/v2/', video_id, app_code, query) v_url = v_data.get('url') if not v_url: error = v_data['message'] if error == "Le contenu sélectionné n'est pas disponible dans votre pays": raise self.raise_geo_restricted(error, self._GEO_COUNTRIES) if error == 'Le contenu sélectionné est disponible seulement en premium': self.raise_login_required(error) raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) formats = self._extract_m3u8_formats(v_url, video_id, 'mp4') self._sort_formats(formats) subtitles = {} closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5') if closed_caption_url: subtitles['fr'] = [{ 'url': closed_caption_url, 'ext': determine_ext(closed_caption_url, 'vtt'), }] return { 'id': video_id, 'title': get_meta('Title') or get_meta('AV-nomEmission'), 'description': get_meta('Description') or get_meta('ShortDescription'), 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), 'duration': int_or_none(get_meta('length')), 'series': get_meta('Emission'), 'season_number': int_or_none('SrcSaison'), 'episode_number': int_or_none('SrcEpisode'), 'upload_date': unified_strdate(get_meta('Date')), 'subtitles': subtitles, 'formats': formats, } def _real_extract(self, url): return self._extract_info(*re.match(self._VALID_URL, url).groups()) class RadioCanadaAudioVideoIE(InfoExtractor): IE_NAME = 'radiocanada:audiovideo' _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', 'info_dict': { 'id': '7527184', 'ext': 'mp4', 'title': 'Barack Obama au Vietnam', 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait 
de la guerre du Vietnam', 'upload_date': '20160523', }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam', 'only_matching': True, }] def _real_extract(self, url): return self.url_result('radiocanada:medianet:%s' % self._match_id(url)) ================================================ FILE: youtube_dl/extractor/radiode.py ================================================ from __future__ import unicode_literals from .common import InfoExtractor class RadioDeIE(InfoExtractor): IE_NAME = 'radio.de' _VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)' _TEST = { 'url': 'http://ndr2.radio.de/', 'info_dict': { 'id': 'ndr2', 'ext': 'mp3', 'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:591c49c702db1a33751625ebfb67f273', 'thumbnail': r're:^https?://.*\.png', 'is_live': True, }, 'params': { 'skip_download': True, } } def _real_extract(self, url): radio_id = self._match_id(url) webpage = self._download_webpage(url, radio_id) jscode = self._search_regex( r"'components/station/stationService':\s*\{\s*'?station'?:\s*(\{.*?\s*\}),\n", webpage, 'broadcast') broadcast = self._parse_json(jscode, radio_id) title = self._live_title(broadcast['name']) description = broadcast.get('description') or broadcast.get('shortDescription') thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100') formats = [{ 'url': stream['streamUrl'], 'ext': stream['streamContentFormat'].lower(), 'acodec': stream['streamContentFormat'], 'abr': stream['bitRate'], 'asr': stream['sampleRate'] } for stream in broadcast['streamUrls']] self._sort_formats(formats) return { 'id': radio_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'is_live': True, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/radiofrance.py 
================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor class RadioFranceIE(InfoExtractor): _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' IE_NAME = 'radiofrance' _TEST = { 'url': 'http://maison.radiofrance.fr/radiovisions/one-one', 'md5': 'bdbb28ace95ed0e04faab32ba3160daf', 'info_dict': { 'id': 'one-one', 'ext': 'ogg', 'title': 'One to one', 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", 'uploader': 'Thomas Hercouët', }, } def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') description = self._html_search_regex( r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', webpage, 'description', fatal=False) uploader = self._html_search_regex( r'<div class="credit">  © (.*?)</div>', webpage, 'uploader', fatal=False) formats_str = self._html_search_regex( r'class="jp-jplayer[^"]*" data-source="([^"]+)">', webpage, 'audio URLs') formats = [ { 'format_id': fm[0], 'url': fm[1], 'vcodec': 'none', 'preference': i, } for i, fm in enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) ] self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, 'description': description, 'uploader': uploader, } ================================================ FILE: youtube_dl/extractor/radiojavan.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( parse_resolution, str_to_int, unified_strdate, urlencode_postdata, urljoin, ) class 
RadioJavanIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?' _TEST = { 'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam', 'md5': 'e85208ffa3ca8b83534fca9fe19af95b', 'info_dict': { 'id': 'chaartaar-ashoobam', 'ext': 'mp4', 'title': 'Chaartaar - Ashoobam', 'thumbnail': r're:^https?://.*\.jpe?g$', 'upload_date': '20150215', 'view_count': int, 'like_count': int, 'dislike_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) download_host = self._download_json( 'https://www.radiojavan.com/videos/video_host', video_id, data=urlencode_postdata({'id': video_id}), headers={ 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': url, }).get('host', 'https://host1.rjmusicmedia.com') webpage = self._download_webpage(url, video_id) formats = [] for format_id, _, video_path in re.findall( r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', webpage): f = parse_resolution(format_id) f.update({ 'url': urljoin(download_host, video_path), 'format_id': format_id, }) formats.append(f) self._sort_formats(formats) title = self._og_search_title(webpage) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._search_regex( r'class="date_added">Date added: ([^<]+)<', webpage, 'upload date', fatal=False)) view_count = str_to_int(self._search_regex( r'class="views">Plays: ([\d,]+)', webpage, 'view count', fatal=False)) like_count = str_to_int(self._search_regex( r'class="rating">([\d,]+) likes', webpage, 'like count', fatal=False)) dislike_count = str_to_int(self._search_regex( r'class="rating">([\d,]+) dislikes', webpage, 'dislike count', fatal=False)) return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'upload_date': upload_date, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/rai.py 
================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_str, compat_urlparse, ) from ..utils import ( determine_ext, ExtractorError, find_xpath_attr, fix_xml_ampersands, GeoRestrictedError, HEADRequest, int_or_none, parse_duration, remove_start, strip_or_none, try_get, unified_strdate, unified_timestamp, update_url_query, urljoin, xpath_text, ) class RaiBaseIE(InfoExtractor): _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _GEO_COUNTRIES = ['IT'] _GEO_BYPASS = False def _extract_relinker_info(self, relinker_url, video_id): if not re.match(r'https?://', relinker_url): return {'formats': [{'url': relinker_url}]} formats = [] geoprotection = None is_live = None duration = None for platform in ('mon', 'flash', 'native'): relinker = self._download_xml( relinker_url, video_id, note='Downloading XML metadata for platform %s' % platform, transform_source=fix_xml_ampersands, query={'output': 45, 'pl': platform}, headers=self.geo_verification_headers()) if not geoprotection: geoprotection = xpath_text( relinker, './geoprotection', default=None) == 'Y' if not is_live: is_live = xpath_text( relinker, './is_live', default=None) == 'Y' if not duration: duration = parse_duration(xpath_text( relinker, './duration', default=None)) url_elem = find_xpath_attr(relinker, './url', 'type', 'content') if url_elem is None: continue media_url = url_elem.text # This does not imply geo restriction (e.g. 
# http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): continue if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif ext == 'f4m' or platform == 'flash': manifest_url = update_url_query( media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) formats.extend(self._extract_f4m_formats( manifest_url, video_id, f4m_id='hds', fatal=False)) else: bitrate = int_or_none(xpath_text(relinker, 'bitrate')) formats.append({ 'url': media_url, 'tbr': bitrate if bitrate > 0 else None, 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', }) if not formats and geoprotection is True: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) formats.extend(self._create_http_urls(relinker_url, formats)) return dict((k, v) for k, v in { 'is_live': is_live, 'duration': duration, 'formats': formats, }.items() if v is not None) def _create_http_urls(self, relinker_url, fmts): _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' 
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' _QUALITY = { # tbr: w, h '250': [352, 198], '400': [512, 288], '700': [512, 288], '800': [700, 394], '1200': [736, 414], '1800': [1024, 576], '2400': [1280, 720], '3200': [1440, 810], '3600': [1440, 810], '5000': [1920, 1080], '10000': [1920, 1080], } def test_url(url): resp = self._request_webpage( HEADRequest(url), None, headers={'User-Agent': 'Rai'}, fatal=False, errnote=False, note=False) if resp is False: return False if resp.code == 200: return False if resp.url == url else resp.url return None def get_format_info(tbr): import math br = int_or_none(tbr) if len(fmts) == 1 and not br: br = fmts[0].get('tbr') if br > 300: tbr = compat_str(math.floor(br / 100) * 100) else: tbr = '250' # try extracting info from available m3u8 formats format_copy = None for f in fmts: if f.get('tbr'): br_limit = math.floor(br / 100) if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1: format_copy = f.copy() return { 'width': format_copy.get('width'), 'height': format_copy.get('height'), 'tbr': format_copy.get('tbr'), 'vcodec': format_copy.get('vcodec'), 'acodec': format_copy.get('acodec'), 'fps': format_copy.get('fps'), 'format_id': 'https-%s' % tbr, } if format_copy else { 'width': _QUALITY[tbr][0], 'height': _QUALITY[tbr][1], 'format_id': 'https-%s' % tbr, 'tbr': int(tbr), } loc = test_url(_MP4_TMPL % (relinker_url, '*')) if not isinstance(loc, compat_str): return [] mobj = re.match( _RELINKER_REG, test_url(relinker_url) or '') if not mobj: return [] available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*'] available_qualities = [i for i in available_qualities if i] formats = [] for q in available_qualities: fmt = { 'url': _MP4_TMPL % (relinker_url, q), 'protocol': 'https', 'ext': 'mp4', } fmt.update(get_format_info(q)) formats.append(fmt) return formats @staticmethod def _extract_subtitles(url, video_data): STL_EXT = 'stl' SRT_EXT = 'srt' subtitles = {} subtitles_array = 
video_data.get('subtitlesArray') or [] for k in ('subtitles', 'subtitlesUrl'): subtitles_array.append({'url': video_data.get(k)}) for subtitle in subtitles_array: sub_url = subtitle.get('url') if sub_url and isinstance(sub_url, compat_str): sub_lang = subtitle.get('language') or 'it' sub_url = urljoin(url, sub_url) sub_ext = determine_ext(sub_url, SRT_EXT) subtitles.setdefault(sub_lang, []).append({ 'ext': sub_ext, 'url': sub_url, }) if STL_EXT == sub_ext: subtitles[sub_lang].append({ 'ext': SRT_EXT, 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, }) return subtitles class RaiPlayIE(RaiBaseIE): _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Rai Gulp', 'duration': 6160, 'series': 'Report', 'season': '2013/14', 'subtitles': { 'it': 'count:2', }, }, 'params': { 'skip_download': True, }, }, { # 1080p direct mp4 url 'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html', 'md5': '2e501e8651d72f05ffe8f5d286ad560b', 'info_dict': { 'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642', 'ext': 'mp4', 'title': 'Leonardo - S1E1', 'alt_title': 'St 1 Ep 1 - Episodio 1', 'description': 'md5:f5360cd267d2de146e4e3879a5a47d31', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Rai 1', 'duration': 3229, 'series': 'Leonardo', 'season': 'Season 1', }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, }, { # subtitles at 'subtitlesArray' key (see #27698) 'url': 
'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', 'only_matching': True, }, { # DRM protected 'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html', 'only_matching': True, }] def _real_extract(self, url): base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( base + '.json', video_id, 'Downloading video JSON') if try_get( media, (lambda x: x['rights_management']['rights']['drm'], lambda x: x['program_info']['rights_management']['rights']['drm']), dict): raise ExtractorError('This video is DRM protected.', expected=True) title = media['name'] video = media['video'] relinker_info = self._extract_relinker_info(video['content_url'], video_id) self._sort_formats(relinker_info['formats']) thumbnails = [] for _, value in media.get('images', {}).items(): if value: thumbnails.append({ 'url': urljoin(url, value), }) date_published = media.get('date_published') time_published = media.get('time_published') if date_published and time_published: date_published += ' ' + time_published subtitles = self._extract_subtitles(url, video) program_info = media.get('program_info') or {} season = media.get('season') info = { 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, 'series': program_info.get('name'), 'season_number': int_or_none(season), 'season': season if (season and not season.isdigit()) else None, 'episode': media.get('episode_title'), 'episode_number': 
int_or_none(media.get('episode')), 'subtitles': subtitles, } info.update(relinker_info) return info class RaiPlayLiveIE(RaiPlayIE): _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, }, 'params': { 'skip_download': True, }, }] class RaiPlayPlaylistIE(InfoExtractor): _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] def _real_extract(self, url): base, playlist_id = re.match(self._VALID_URL, url).groups() program = self._download_json( base + '.json', playlist_id, 'Downloading program JSON') entries = [] for b in (program.get('blocks') or []): for s in (b.get('sets') or []): s_id = s.get('id') if not s_id: continue medias = self._download_json( '%s/%s.json' % (base, s_id), s_id, 'Downloading content set JSON', fatal=False) if not medias: continue for m in (medias.get('items') or []): path_id = m.get('path_id') if not path_id: continue video_url = urljoin(url, path_id) entries.append(self.url_result( video_url, ie=RaiPlayIE.ie_key(), video_id=RaiPlayIE._match_id(video_url))) return self.playlist_result( entries, playlist_id, program.get('name'), try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ # var uniquename = "ContentItem-..." 
# data-id="ContentItem-..." 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'mp4', 'title': 'TG PRIMO TEMPO', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, 'upload_date': '20140612', }, 'skip': 'This content is available only in Italy', }, { # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', 'info_dict': { 'id': '1632c009-c843-4836-bb65-80c33084a64b', 'ext': 'mp4', 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', 'description': 'I film in uscita questa settimana.', 'thumbnail': r're:^https?://.*\.png$', 'duration': 833, 'upload_date': '20161103', } }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', 'md5': '06345bd97c932f19ffb129973d07a020', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', 'title': 'TG1 ore 20:00 del 03/11/2016', 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2214, 'upload_date': '20161103', } }, { # initEdizione('ContentItem-...' 
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', 'info_dict': { 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', 'ext': 'mp4', 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', 'duration': 2274, 'upload_date': '20170401', }, 'skip': 'Changes daily', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', 'info_dict': { 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', 'ext': 'mp4', 'title': 'La diretta di Rainews24', }, 'params': { 'skip_download': True, }, }, { # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', 'info_dict': { 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', 'ext': 'mp4', 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', 'description': 'md5:d291b03407ec505f95f27970c0b025f4', 'upload_date': '20150913', 'subtitles': { 'it': 'count:2', }, }, 'params': { 'skip_download': True, }, }, { # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', 'only_matching': True, }, { 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', 'only_matching': True, }] def _extract_from_content_id(self, content_id, url): media = self._download_json( 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, content_id, 'Downloading video JSON') title = media['name'].strip() media_type = media['type'] if 'Audio' in media_type: relinker_info = { 'formats': [{ 'format_id': media.get('formatoAudio'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), }] } elif 'Video' in media_type: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) else: raise ExtractorError('not a 
media file') self._sort_formats(relinker_info['formats']) thumbnails = [] for image_type in ('image', 'image_medium', 'image_300'): thumbnail_url = media.get(image_type) if thumbnail_url: thumbnails.append({ 'url': compat_urlparse.urljoin(url, thumbnail_url), }) subtitles = self._extract_subtitles(url, media) info = { 'id': content_id, 'title': title, 'description': strip_or_none(media.get('desc')), 'thumbnails': thumbnails, 'uploader': media.get('author'), 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), 'subtitles': subtitles, } info.update(relinker_info) return info def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) content_item_id = None content_item_url = self._html_search_meta( ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', 'twitter:player', 'jsonlink'), webpage, default=None) if content_item_url: content_item_id = self._search_regex( r'ContentItem-(%s)' % self._UUID_RE, content_item_url, 'content item id', default=None) if not content_item_id: content_item_id = self._search_regex( r'''(?x) (?: (?:initEdizione|drawMediaRaiTV)\(| <(?:[^>]+\bdata-id|var\s+uniquename)=| <iframe[^>]+\bsrc= ) (["\']) (?:(?!\1).)*\bContentItem-(?P<id>%s) ''' % self._UUID_RE, webpage, 'content item id', default=None, group='id') content_item_ids = set() if content_item_id: content_item_ids.add(content_item_id) if video_id not in content_item_ids: content_item_ids.add(video_id) for content_item_id in content_item_ids: try: return self._extract_from_content_id(content_item_id, url) except GeoRestrictedError: raise except ExtractorError: pass relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| mediaInfo\.mediaUri )\s*=\s* ([\'"]) (?P<url> (?:https?:)? //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? 
(?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) self._sort_formats(relinker_info['formats']) title = self._search_regex( r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', webpage, 'title', group='title', default=None) or self._og_search_title(webpage) info = { 'id': video_id, 'title': title, } info.update(relinker_info) return info ================================================ FILE: youtube_dl/extractor/raywenderlich.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from .vimeo import VimeoIE from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, merge_dicts, try_get, unescapeHTML, unified_timestamp, urljoin, ) class RayWenderlichIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: videos\.raywenderlich\.com/courses| (?:www\.)?raywenderlich\.com )/ (?P<course_id>[^/]+)/lessons/(?P<id>\d+) ''' _TESTS = [{ 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1', 'info_dict': { 'id': '248377018', 'ext': 'mp4', 'title': 'Introduction', 'description': 'md5:804d031b3efa9fcb49777d512d74f722', 'timestamp': 1513906277, 'upload_date': '20171222', 'duration': 133, 'uploader': 'Ray Wenderlich', 'uploader_id': 'user3304672', }, 'params': { 'noplaylist': True, 'skip_download': True, }, 'add_ie': [VimeoIE.ie_key()], 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', 'only_matching': True, }] @staticmethod def _extract_video_id(data, lesson_id): if not data: return groups = try_get(data, lambda x: x['groups'], list) or [] if not groups: return for group in groups: if not isinstance(group, dict): continue contents = try_get(data, lambda x: x['contents'], list) or [] for content in contents: if not isinstance(content, dict): continue ordinal = 
int_or_none(content.get('ordinal')) if ordinal != lesson_id: continue video_id = content.get('identifier') if video_id: return compat_str(video_id) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) course_id, lesson_id = mobj.group('course_id', 'id') display_id = '%s/%s' % (course_id, lesson_id) webpage = self._download_webpage(url, display_id) thumbnail = self._og_search_thumbnail( webpage, default=None) or self._html_search_meta( 'twitter:image', webpage, 'thumbnail') if '>Subscribe to unlock' in webpage: raise ExtractorError( 'This content is only available for subscribers', expected=True) info = { 'thumbnail': thumbnail, } vimeo_id = self._search_regex( r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None) if not vimeo_id: data = self._parse_json( self._search_regex( r'data-collection=(["\'])(?P<data>{.+?})\1', webpage, 'data collection', default='{}', group='data'), display_id, transform_source=unescapeHTML, fatal=False) video_id = self._extract_video_id( data, lesson_id) or self._search_regex( r'/videos/(\d+)/', thumbnail, 'video id') headers = { 'Referer': url, 'X-Requested-With': 'XMLHttpRequest', } csrf_token = self._html_search_meta( 'csrf-token', webpage, 'csrf token', default=None) if csrf_token: headers['X-CSRF-Token'] = csrf_token video = self._download_json( 'https://videos.raywenderlich.com/api/v1/videos/%s.json' % video_id, display_id, headers=headers)['video'] vimeo_id = video['clips'][0]['provider_id'] info.update({ '_type': 'url_transparent', 'title': video.get('name'), 'description': video.get('description') or video.get( 'meta_description'), 'duration': int_or_none(video.get('duration')), 'timestamp': unified_timestamp(video.get('created_at')), }) return merge_dicts(info, self.url_result( VimeoIE._smuggle_referrer( 'https://player.vimeo.com/video/%s' % vimeo_id, url), ie=VimeoIE.ie_key(), video_id=vimeo_id)) class RayWenderlichCourseIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: 
videos\.raywenderlich\.com/courses| (?:www\.)?raywenderlich\.com )/ (?P<id>[^/]+) ''' _TEST = { 'url': 'https://www.raywenderlich.com/3530-testing-in-ios', 'info_dict': { 'title': 'Testing in iOS', 'id': '3530-testing-in-ios', }, 'params': { 'noplaylist': False, }, 'playlist_count': 29, } @classmethod def suitable(cls, url): return False if RayWenderlichIE.suitable(url) else super( RayWenderlichCourseIE, cls).suitable(url) def _real_extract(self, url): course_id = self._match_id(url) webpage = self._download_webpage(url, course_id) entries = [] lesson_urls = set() for lesson_url in re.findall( r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage): if lesson_url in lesson_urls: continue lesson_urls.add(lesson_url) entries.append(self.url_result( urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key())) title = self._og_search_title( webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', default=None) return self.playlist_result(entries, course_id, title) ================================================ FILE: youtube_dl/extractor/rbgtum.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor class RbgTumIE(InfoExtractor): _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)' _TESTS = [{ # Combined view 'url': 'https://live.rbg.tum.de/w/cpp/22128', 'md5': '53a5e7b3e07128e33bbf36687fe1c08f', 'info_dict': { 'id': 'cpp/22128', 'ext': 'mp4', 'title': 'Lecture: October 18. 
2022', 'series': 'Concepts of C++ programming (IN2377)', } }, { # Presentation only 'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES', 'md5': '36c584272179f3e56b0db5d880639cba', 'info_dict': { 'id': 'I2DL/12349/PRES', 'ext': 'mp4', 'title': 'Lecture 3: Introduction to Neural Networks', 'series': 'Introduction to Deep Learning (IN2346)', } }, { # Camera only 'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM', 'md5': 'e04189d92ff2f56aedf5cede65d37aad', 'info_dict': { 'id': 'fvv-info/16130/CAM', 'ext': 'mp4', 'title': 'Fachschaftsvollversammlung', 'series': 'Fachschaftsvollversammlung Informatik', } }, ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8') lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') lecture_series_title = self._html_search_regex( r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?', webpage, 'series') formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') self._sort_formats(formats) return { 'id': video_id, 'title': lecture_title, 'series': lecture_series_title, 'formats': formats, } class RbgTumCourseIE(InfoExtractor): _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P.+)' _TESTS = [{ 'url': 'https://live.rbg.tum.de/course/2022/S/fpv', 'info_dict': { 'title': 'Funktionale Programmierung und Verifikation (IN0003)', 'id': '2022/S/fpv', }, 'params': { 'noplaylist': False, }, 'playlist_count': 13, }, { 'url': 'https://live.rbg.tum.de/course/2022/W/set', 'info_dict': { 'title': 'SET FSMPIC', 'id': '2022/W/set', }, 'params': { 'noplaylist': False, }, 'playlist_count': 6, }, ] def _real_extract(self, url): course_id = self._match_id(url) webpage = self._download_webpage(url, course_id) lecture_series_title = self._html_search_regex(r'(?si)(.*)', webpage, 'title') lecture_urls = [] for lecture_url in 
re.findall(r'(?i)href="/w/(.+)(?[^/]+)/episodes/(?P[^/?#&]+)' _TEST = { 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { 'id': 'ford-lopatin-live-at-primavera-sound-2011', 'ext': 'mp3', 'title': 'Main Stage - Ford & Lopatin at Primavera Sound', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2452, 'timestamp': 1307103164, 'upload_date': '20110603', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) show_id = mobj.group('show_id') episode_id = mobj.group('id') webpage = self._download_webpage(url, episode_id) episode = self._parse_json( self._search_regex( r'__INITIAL_STATE__\s*=\s*({.+?})\s*', webpage, 'json data'), episode_id)['episodes'][show_id][episode_id] title = episode['title'] show_title = episode.get('showTitle') if show_title: title = '%s - %s' % (show_title, title) formats = [{ 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), 'format_id': compat_str(abr), 'abr': abr, 'vcodec': 'none', } for abr in (96, 128, 192, 256)] self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) duration = int_or_none(episode.get('duration')) timestamp = unified_timestamp(episode.get('publishedAt')) return { 'id': episode_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'timestamp': timestamp, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/rds.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, js_to_json, ) from ..compat import compat_str class RDSIE(InfoExtractor): IE_DESC = 'RDS.ca' _VALID_URL = 
r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P[^/]+)-\d+\.\d+' _TESTS = [{ # has two 9c9media ContentPackages, the web player selects the first ContentPackage 'url': 'https://www.rds.ca/videos/Hockey/NationalHockeyLeague/teams/9/forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande-3.1377606', 'info_dict': { 'id': '2083309', 'display_id': 'forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande', 'ext': 'flv', 'title': 'Forum du 5 à 7 : Kotkaniemi de retour de Finlande', 'description': 'md5:83fa38ecc4a79b19e433433254077f25', 'timestamp': 1606129030, 'upload_date': '20201123', 'duration': 773.039, } }, { 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', 'only_matching': True, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) video_id = compat_str(item['id']) title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta( 'title', webpage, 'title', fatal=True) description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex( [r']+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', r']+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], webpage, 'thumbnail', fatal=False) timestamp = parse_iso8601(self._search_regex( r']+itemprop="uploadDate"[^>]+content="([^"]+)"', webpage, 'upload date', fatal=False)) duration = parse_duration(self._search_regex( r']+itemprop="duration"[^>]+content="([^"]+)"', webpage, 'duration', fatal=False)) age_limit = self._family_friendly_search(webpage) return { '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, 'url': '9c9media:rds_web:%s' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 
'duration': duration, 'age_limit': age_limit, 'ie_key': 'NineCNineMedia', } ================================================ FILE: youtube_dl/extractor/redbulltv.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( float_or_none, ExtractorError, ) class RedBullTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live|(?:film|episode)s)/(?PAP-\w+)' _TESTS = [{ # film 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', 'md5': 'fb0445b98aa4394e504b413d98031d1f', 'info_dict': { 'id': 'AP-1Q6XCDTAN1W11', 'ext': 'mp4', 'title': 'ABC of... WRC - ABC of... S1E6', 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', 'duration': 1582.04, }, }, { # episode 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', 'info_dict': { 'id': 'AP-1PMHKJFCW1W11', 'ext': 'mp4', 'title': 'Grime - Hashtags S2E4', 'description': 'md5:5546aa612958c08a98faaad4abce484d', 'duration': 904, }, 'params': { 'skip_download': True, }, }, { 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173', 'only_matching': True, }, { 'url': 'https://www.redbull.com/us-en/videos/AP-1YM9QCYE52111', 'only_matching': True, }, { 'url': 'https://www.redbull.com/us-en/events/AP-1XV2K61Q51W11/live/AP-1XUJ86FDH1W11', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/films/AP-1ZSMAW8FH2111', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/episodes/AP-1TQWK7XE11W11', 'only_matching': True, }] def extract_info(self, video_id): session = self._download_json( 'https://api.redbull.tv/v3/session', video_id, note='Downloading access token', query={ 'category': 'personal_computer', 'os_family': 'http', }) if session.get('code') == 'error': raise ExtractorError('%s said: %s' % ( 
self.IE_NAME, session['message'])) token = session['token'] try: video = self._download_json( 'https://api.redbull.tv/v3/products/' + video_id, video_id, note='Downloading video information', headers={'Authorization': token} ) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: error_message = self._parse_json( e.cause.read().decode(), video_id)['error'] raise ExtractorError('%s said: %s' % ( self.IE_NAME, error_message), expected=True) raise title = video['title'].strip() formats = self._extract_m3u8_formats( 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') self._sort_formats(formats) subtitles = {} for resource in video.get('resources', []): if resource.startswith('closed_caption_'): splitted_resource = resource.split('_') if splitted_resource[2]: subtitles.setdefault('en', []).append({ 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), 'ext': splitted_resource[2], }) subheading = video.get('subheading') if subheading: title += ' - %s' % subheading return { 'id': video_id, 'title': title, 'description': video.get('long_description') or video.get( 'short_description'), 'duration': float_or_none(video.get('duration'), scale=1000), 'formats': formats, 'subtitles': subtitles, } def _real_extract(self, url): video_id = self._match_id(url) return self.extract_info(video_id) class RedBullEmbedIE(RedBullTVIE): _VALID_URL = r'https?://(?:www\.)?redbull\.com/embed/(?Prrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}:[a-z]{2}-[A-Z]{2,3})' _TESTS = [{ # HLS manifest accessible only using assetId 'url': 'https://www.redbull.com/embed/rrn:content:episode-videos:f3021f4f-3ed4-51ac-915a-11987126e405:en-INT', 'only_matching': True, }] _VIDEO_ESSENSE_TMPL = '''... 
on %s { videoEssence { attributes } }''' def _real_extract(self, url): rrn_id = self._match_id(url) asset_id = self._download_json( 'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql', rrn_id, headers={ 'Accept': 'application/json', 'API-KEY': 'e90a1ff11335423998b100c929ecc866', }, query={ 'query': '''{ resource(id: "%s", enforceGeoBlocking: false) { %s %s } }''' % (rrn_id, self._VIDEO_ESSENSE_TMPL % 'LiveVideo', self._VIDEO_ESSENSE_TMPL % 'VideoResource'), })['data']['resource']['videoEssence']['attributes']['assetId'] return self.extract_info(asset_id) class RedBullTVRrnContentIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P[a-z]{2,3})-(?P[a-z]{2})/tv/(?:video|live|film)/(?Prrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/tv/film/rrn:content:films:d1f4d00e-4c04-5d19-b510-a805ffa2ab83/follow-me', 'only_matching': True, }] def _real_extract(self, url): region, lang, rrn_id = re.search(self._VALID_URL, url).groups() rrn_id += ':%s-%s' % (lang, region.upper()) return self.url_result( 'https://www.redbull.com/embed/' + rrn_id, RedBullEmbedIE.ie_key(), rrn_id) class RedBullIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P[a-z]{2,3})-(?P[a-z]{2})/(?P(?:episode|film|(?:(?:recap|trailer)-)?video)s|live)/(?!AP-|rrn:content:)(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.redbull.com/int-en/episodes/grime-hashtags-s02-e04', 'md5': 'db8271a7200d40053a1809ed0dd574ff', 'info_dict': { 'id': 'AA-1MT8DQWA91W14', 'ext': 'mp4', 'title': 'Grime - Hashtags S2E4', 
'description': 'md5:5546aa612958c08a98faaad4abce484d', }, }, { 'url': 'https://www.redbull.com/int-en/films/kilimanjaro-mountain-of-greatness', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/recap-videos/uci-mountain-bike-world-cup-2017-mens-xco-finals-from-vallnord', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/trailer-videos/kings-of-content', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/videos/tnts-style-red-bull-dance-your-style-s1-e12', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/live/mens-dh-finals-fort-william', 'only_matching': True, }, { # only available on the int-en website so a fallback is need for the API # https://www.redbull.com/v3/api/graphql/v1/v3/query/en-GB>en-INT?filter[uriSlug]=fia-wrc-saturday-recap-estonia&rb3Schema=v1:hero 'url': 'https://www.redbull.com/gb-en/live/fia-wrc-saturday-recap-estonia', 'only_matching': True, }] _INT_FALLBACK_LIST = ['de', 'en', 'es', 'fr'] _LAT_FALLBACK_MAP = ['ar', 'bo', 'car', 'cl', 'co', 'mx', 'pe'] def _real_extract(self, url): region, lang, filter_type, display_id = re.search(self._VALID_URL, url).groups() if filter_type == 'episodes': filter_type = 'episode-videos' elif filter_type == 'live': filter_type = 'live-videos' regions = [region.upper()] if region != 'int': if region in self._LAT_FALLBACK_MAP: regions.append('LAT') if lang in self._INT_FALLBACK_LIST: regions.append('INT') locale = '>'.join(['%s-%s' % (lang, reg) for reg in regions]) rrn_id = self._download_json( 'https://www.redbull.com/v3/api/graphql/v1/v3/query/' + locale, display_id, query={ 'filter[type]': filter_type, 'filter[uriSlug]': display_id, 'rb3Schema': 'v1:hero', })['data']['id'] return self.url_result( 'https://www.redbull.com/embed/' + rrn_id, RedBullEmbedIE.ie_key(), rrn_id) ================================================ FILE: youtube_dl/extractor/reddit.py ================================================ from __future__ import unicode_literals 
class RedditIE(InfoExtractor):
    _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)'
    _TEST = {
        # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
        'url': 'https://v.redd.it/zv89llsvexdz',
        'md5': '0a070c53eba7ec4534d95a5a1259e253',
        'info_dict': {
            'id': 'zv89llsvexdz',
            'ext': 'mp4',
            'title': 'zv89llsvexdz',
        },
        'params': {
            'format': 'bestvideo',
        },
    }

    def _real_extract(self, url):
        """Collect formats straight from v.redd.it's HLS and DASH manifests.

        The host exposes no metadata of its own, so the media id doubles
        as the title.
        """
        media_id = self._match_id(url)

        base_url = 'https://v.redd.it/%s' % media_id
        formats = self._extract_m3u8_formats(
            base_url + '/HLSPlaylist.m3u8', media_id, 'mp4',
            entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
        formats.extend(self._extract_mpd_formats(
            base_url + '/DASHPlaylist.mpd', media_id,
            mpd_id='dash', fatal=False))
        self._sort_formats(formats)

        return {
            'id': media_id,
            'title': media_id,
            'formats': formats,
        }
'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', 'only_matching': True, }, { # youtube 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', 'only_matching': True, }, { # reddit video @ nm reddit 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) url, video_id = mobj.group('url', 'id') video_id = self._match_id(url) data = self._download_json( url + '/.json', video_id)[0]['data']['children'][0]['data'] video_url = data['url'] # Avoid recursing into the same reddit URL if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: raise ExtractorError('No media found', expected=True) over_18 = data.get('over_18') if over_18 is True: age_limit = 18 elif over_18 is False: age_limit = 0 else: age_limit = None thumbnails = [] def add_thumbnail(src): if not isinstance(src, dict): return thumbnail_url = url_or_none(src.get('url')) if not thumbnail_url: return thumbnails.append({ 'url': unescapeHTML(thumbnail_url), 'width': int_or_none(src.get('width')), 'height': int_or_none(src.get('height')), }) for image in try_get(data, lambda x: x['preview']['images']) or []: if not isinstance(image, dict): continue add_thumbnail(image.get('source')) resolutions = image.get('resolutions') if isinstance(resolutions, list): for resolution in resolutions: add_thumbnail(resolution) return { '_type': 'url_transparent', 'url': video_url, 'title': data.get('title'), 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), 'duration': int_or_none(try_get( data, (lambda x: x['media']['reddit_video']['duration'], lambda x: x['secure_media']['reddit_video']['duration']))), 'like_count': int_or_none(data.get('ups')), 'dislike_count': int_or_none(data.get('downs')), 'comment_count': 
    def _real_extract(self, url):
        """Extract a RedTube video.

        Flow: fetch the desktop page, bail out early on removed/private
        videos, seed metadata from JSON-LD, then collect formats from (in
        order) the 'sources' JS object, the 'mediaDefinition' list (direct
        MP4s and HLS playlists) and, failing both, a bare <source> tag.
        """
        video_id = self._match_id(url)
        # Always fetch the canonical www page, even for embed/intl URLs.
        webpage = self._download_webpage(
            'http://www.redtube.com/%s' % video_id, video_id)

        # Known error banners -> raise a friendly message instead of
        # failing later with 'no formats found'.
        ERRORS = (
            (('video-deleted-info', '>This video has been removed'), 'has been removed'),
            (('private_video_text', '>This video is private',
              '>Send a friend request to its owner to be able to view it'),
             'is private'),
        )

        for patterns, message in ERRORS:
            if any(p in webpage for p in patterns):
                raise ExtractorError(
                    'Video %s %s' % (video_id, message), expected=True)

        info = self._search_json_ld(webpage, video_id, default={})

        if not info.get('title'):
            # Fallback title: heading element, then JS variable, then og:title.
            info['title'] = self._html_search_regex(
                (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
                 r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',),
                webpage, 'title', group='title',
                default=None) or self._og_search_title(webpage)

        formats = []
        # Older page layout: 'sources' maps quality label -> direct URL.
        sources = self._parse_json(
            self._search_regex(
                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
            video_id, fatal=False)
        if sources and isinstance(sources, dict):
            for format_id, format_url in sources.items():
                if format_url:
                    formats.append({
                        'url': format_url,
                        'format_id': format_id,
                        # quality labels are numeric heights like '720'
                        'height': int_or_none(format_id),
                    })
        # Newer layout: 'mediaDefinition' is a list of media descriptors.
        medias = self._parse_json(
            self._search_regex(
                r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage,
                'media definitions', default='{}'),
            video_id, fatal=False)
        if medias and isinstance(medias, list):
            for media in medias:
                format_url = url_or_none(media.get('videoUrl'))
                if not format_url:
                    continue
                if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4',
                        entry_protocol='m3u8_native',
                        m3u8_id='hls', fatal=False))
                    continue
                format_id = media.get('quality')
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                    'height': int_or_none(format_id),
                })
        # Last resort: a plain <source> tag in the page markup.
        if not formats:
            video_url = self._html_search_regex(
                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
            formats.append({'url': video_url})
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)
        upload_date = unified_strdate(self._search_regex(
            r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<',
            webpage, 'upload date', default=None))
        duration = int_or_none(self._og_search_property(
            'video:duration', webpage, default=None) or self._search_regex(
                r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
        view_count = str_to_int(self._search_regex(
            (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)',
             r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)',
             r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'),
            webpage, 'view count', default=None))

        # No self-labeling, but they describe themselves as
        # "Home of Videos Porno"
        age_limit = 18

        # JSON-LD values win over scraped fallbacks where both exist.
        return merge_dicts(info, {
            'id': video_id,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'duration': duration,
            'view_count': view_count,
            'age_limit': age_limit,
            'formats': formats,
        })
age_limit, 'formats': formats, }) ================================================ FILE: youtube_dl/extractor/regiotv.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( sanitized_Request, xpath_text, xpath_with_ns, ) class RegioTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.regio-tv.de/video/395808.html', 'info_dict': { 'id': '395808', 'ext': 'mp4', 'title': 'Wir in Ludwigsburg', 'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!', } }, { 'url': 'http://www.regio-tv.de/video/395808', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) key = self._search_regex( r'key\s*:\s*(["\'])(?P<key>.+?)\1', webpage, 'key', group='key') title = self._og_search_title(webpage) SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>' request = sanitized_Request( 'http://v.telvi.de/', SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8')) video_data = self._download_xml(request, video_id, 'Downloading video XML') NS_MAP = { 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', 'soap': 'http://schemas.xmlsoap.org/soap/envelope/', } video_url = xpath_text( video_data, xpath_with_ns('.//video', NS_MAP), 'video url', fatal=True) thumbnail = xpath_text( video_data, xpath_with_ns('.//image', NS_MAP), 'thumbnail') description = self._og_search_description( webpage) or self._html_search_meta('description', webpage) return { 'id': video_id, 'url': video_url, 'title': title, 'description': 
description, 'thumbnail': thumbnail, } ================================================ FILE: youtube_dl/extractor/rentv.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, url_or_none, ) class RENTVIE(InfoExtractor): _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://ren.tv/video/epizod/118577', 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', 'info_dict': { 'id': '118577', 'ext': 'mp4', 'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"', 'timestamp': 1472230800, 'upload_date': '20160826', } }, { 'url': 'http://ren.tv/player/118577', 'only_matching': True, }, { 'url': 'rentv:118577', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id) config = self._parse_json(self._search_regex( r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id) title = config['title'] formats = [] for video in config['src']: src = url_or_none(video.get('src')) if not src: continue ext = determine_ext(src) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( src, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) else: formats.append({ 'url': src, }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': config.get('description'), 'thumbnail': config.get('image'), 'duration': int_or_none(config.get('duration')), 'timestamp': int_or_none(config.get('date')), 'formats': formats, } class RENTVArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ren\.tv/novosti/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://ren.tv/novosti/2016-10-26/video-mikroavtobus-popavshiy-v-dtp-s-gruzovikami-v-podmoskove-prevratilsya-v', 'md5': 
'ebd63c4680b167693745ab91343df1d6', 'info_dict': { 'id': '136472', 'ext': 'mp4', 'title': 'Видео: микроавтобус, попавший в ДТП с грузовиками в Подмосковье, превратился в груду металла', 'description': 'Жертвами столкновения двух фур и микроавтобуса, по последним данным, стали семь человек.', } }, { # TODO: invalid m3u8 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', 'info_dict': { 'id': 'playlist', 'ext': 'mp4', 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ', 'uploader': 'ren.tv', }, 'params': { # m3u8 downloads 'skip_download': True, }, 'skip': True, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) drupal_settings = self._parse_json(self._search_regex( r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), display_id) entries = [] for config_profile in drupal_settings.get('ren_jwplayer', {}).values(): media_id = config_profile.get('mediaid') if not media_id: continue media_id = compat_str(media_id) entries.append(self.url_result('rentv:' + media_id, 'RENTV', media_id)) return self.playlist_result(entries, display_id) ================================================ FILE: youtube_dl/extractor/restudy.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class RestudyIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.restudy.dk/video/play/id/1637', 'info_dict': { 'id': '1637', 'ext': 'flv', 'title': 'Leiden-frosteffekt', 'description': 'Denne video er et eksperiment med flydende kvælstof.', }, 'params': { # rtmp download 'skip_download': True, } }, { 'url': 'https://portal.restudy.dk/video/leiden-frosteffekt/id/1637', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = 
    def _real_extract(self, url):
        """Extract a Reuters video.

        The iframe page embeds player config as a JS object literal; the
        'flv' value encodes the Yospace MAS ids (mmid/fid) from which the
        actual format list is fetched as JSON.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id)
        # Raw drawPlayer(...) argument, normalized to JSON-ish text so
        # simple key lookups below work.
        video_data = js_to_json(self._search_regex(
            r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);',
            webpage, 'video data'))

        def get_json_value(key, fatal=False):
            # Pull a single string value out of video_data by key.
            return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal)
        title = unescapeHTML(get_json_value('title', fatal=True))
        # The 'flv' URL carries the MAS media id and file id as ',/<mmid>?f=<fid>'.
        mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups()

        mas_data = self._download_json(
            'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid),
            video_id, transform_source=js_to_json)
        formats = []
        for f in mas_data:
            f_url = f.get('url')
            if not f_url:
                continue
            method = f.get('method')
            if method == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
            else:
                container = f.get('container')
                # 'mobile' entries are 3GP regardless of declared container.
                ext = '3gp' if method == 'mobile' else container
                formats.append({
                    'format_id': ext,
                    'url': f_url,
                    'ext': ext,
                    'container': container if method != 'mobile' else None,
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': get_json_value('thumb'),
            'duration': int_or_none(get_json_value('seconds')),
            'formats': formats,
        }
fatal=False)) else: container = f.get('container') ext = '3gp' if method == 'mobile' else container formats.append({ 'format_id': ext, 'url': f_url, 'ext': ext, 'container': container if method != 'mobile' else None, }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'thumbnail': get_json_value('thumb'), 'duration': int_or_none(get_json_value('seconds')), 'formats': formats, } ================================================ FILE: youtube_dl/extractor/reverbnation.py ================================================ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( qualities, str_or_none, ) class ReverbNationIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' _TESTS = [{ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645', 'info_dict': { 'id': '16965047', 'ext': 'mp3', 'title': 'MONA LISA', 'uploader': 'ALKILADOS', 'uploader_id': '216429', 'thumbnail': r're:^https?://.*\.jpg', }, }] def _real_extract(self, url): song_id = self._match_id(url) api_res = self._download_json( 'https://api.reverbnation.com/song/%s' % song_id, song_id, note='Downloading information of song %s' % song_id ) THUMBNAILS = ('thumbnail', 'image') quality = qualities(THUMBNAILS) thumbnails = [] for thumb_key in THUMBNAILS: if api_res.get(thumb_key): thumbnails.append({ 'url': api_res[thumb_key], 'preference': quality(thumb_key) }) return { 'id': song_id, 'title': api_res['name'], 'url': api_res['url'], 'uploader': api_res.get('artist', {}).get('name'), 'uploader_id': str_or_none(api_res.get('artist', {}).get('id')), 'thumbnails': thumbnails, 'ext': 'mp3', 'vcodec': 'none', } ================================================ FILE: youtube_dl/extractor/rice.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import 
    def _real_extract(self, url):
        """Extract a Rice University MediaHub video.

        Two XML API calls: GetContentTitle yields metadata plus a temporary
        link id, which GetPlayerConfig exchanges for the player sources
        (RTSP, HLS and RTMP variants).
        """
        # All three ids are required query parameters of the portal URL.
        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
        if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'):
            raise ExtractorError('Invalid URL', expected=True)

        portal_id = qs['PortalID'][0]
        playlist_id = qs['DestinationID'][0]
        content_id = qs['ContentID'][0]

        content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={
            'portalId': portal_id,
            'playlistId': playlist_id,
            'contentId': content_id
        })
        metadata = xpath_element(content_data, './/metaData', fatal=True)
        title = xpath_text(metadata, 'primaryTitle', fatal=True)
        encodings = xpath_element(content_data, './/encodings', fatal=True)
        player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={
            'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True),
            'contentId': content_id,
        })

        # Shared width/height for every format, parsed from 'WxH'.
        common_fmt = {}
        dimensions = xpath_text(encodings, 'dimensions')
        if dimensions:
            wh = dimensions.split('x')
            if len(wh) == 2:
                common_fmt.update({
                    'width': int_or_none(wh[0]),
                    'height': int_or_none(wh[1]),
                })

        formats = []
        rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS))
        if rtsp_path:
            fmt = {
                'url': rtsp_path,
                'format_id': 'rtsp',
            }
            fmt.update(common_fmt)
            formats.append(fmt)
        for source in player_data.findall(self._xpath_ns('.//Source', self._NS)):
            video_url = xpath_text(source, self._xpath_ns('File', self._NS))
            if not video_url:
                continue
            if '.m3u8' in video_url:
                formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
            else:
                fmt = {
                    'url': video_url,
                    # scheme prefix (e.g. 'rtmp') doubles as the format id
                    'format_id': video_url.split(':')[0],
                }
                fmt.update(common_fmt)
                # RTMP URLs must be split into server URL / app / playpath.
                rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url)
                if rtmp:
                    fmt.update({
                        'url': rtmp.group('url'),
                        'play_path': rtmp.group('playpath'),
                        'app': rtmp.group('app'),
                        'ext': 'flv',
                    })
                formats.append(fmt)
        self._sort_formats(formats)

        # Image-type content assets become thumbnails.
        thumbnails = []
        for content_asset in content_data.findall('.//contentAssets'):
            asset_type = xpath_text(content_asset, 'type')
            if asset_type == 'image':
                image_url = xpath_text(content_asset, 'httpPath')
                if not image_url:
                    continue
                thumbnails.append({
                    'id': xpath_text(content_asset, 'ID'),
                    'url': image_url,
                })

        return {
            'id': content_id,
            'title': title,
            'description': xpath_text(metadata, 'abstract'),
            'duration': int_or_none(xpath_text(metadata, 'duration')),
            'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')),
            'thumbnails': thumbnails,
            'formats': formats,
        }
'CORVETTE', 'description': 'md5:c1e8295521e45ffebf635d6a7658f506', 'uploader_id': '1969646226001', 'upload_date': '20181226', 'timestamp': 1545861635, }, 'params': { 'skip_download': True, }, 'skip': 'only available for a week', }, { # live, geo restricted, bypassable 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/', 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') or mobj.group('live_id') webpage = self._download_webpage(url, display_id) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) if brightcove_legacy_url: brightcove_id = compat_parse_qs(compat_urlparse.urlparse( brightcove_legacy_url).query)['@videoPlayer'][0] else: brightcove_id = self._search_regex( r'data-video-id=["\'](\d+)', webpage, 'brightcove id') return self.url_result( smuggle_url( self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['FR']}), 'BrightcoveNew', brightcove_id) ================================================ FILE: youtube_dl/extractor/ro220.py ================================================ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote class Ro220IE(InfoExtractor): IE_NAME = '220.ro' _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/', 'md5': '03af18b73a07b4088753930db7a34add', 'info_dict': { 'id': 'LYV6doKo7f', 'ext': 'mp4', 'title': 'Luati-le Banii sez 4 ep 1', 'description': r're:^Iata-ne reveniti dupa o binemeritata vacanta\. 
+Va astept si pe Facebook cu pareri si comentarii.$', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) url = compat_urllib_parse_unquote(self._search_regex( r'(?s)clip\s*:\s*{.*?url\s*:\s*\'([^\']+)\'', webpage, 'url')) title = self._og_search_title(webpage) description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) formats = [{ 'format_id': 'sd', 'url': url, 'ext': 'mp4', }] return { 'id': video_id, 'formats': formats, 'title': title, 'description': description, 'thumbnail': thumbnail, } ================================================ FILE: youtube_dl/extractor/rockstargames.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, ) class RockstarGamesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.rockstargames.com/videos/video/11544/', 'md5': '03b5caa6e357a4bd50e3143fc03e5733', 'info_dict': { 'id': '11544', 'ext': 'mp4', 'title': 'Further Adventures in Finance and Felony Trailer', 'description': 'md5:6d31f55f30cb101b5476c4a379e324a3', 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1464876000, 'upload_date': '20160602', } }, { 'url': 'http://www.rockstargames.com/videos#/?video=48', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( 'https://www.rockstargames.com/videoplayer/videos/get-video.json', video_id, query={ 'id': video_id, 'locale': 'en_us', })['video'] title = video['title'] formats = [] for video in video['files_processed']['video/mp4']: if not video.get('src'): continue resolution = video.get('resolution') height = int_or_none(self._search_regex( r'^(\d+)[pP]$', resolution or '', 'height', default=None)) formats.append({ 'url': 
self._proto_relative_url(video['src']), 'format_id': resolution, 'height': height, }) if not formats: youtube_id = video.get('youtube_id') if youtube_id: return self.url_result(youtube_id, 'Youtube') self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': video.get('description'), 'thumbnail': self._proto_relative_url(video.get('screencap')), 'timestamp': parse_iso8601(video.get('created')), 'formats': formats, } ================================================ FILE: youtube_dl/extractor/roosterteeth.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, ) from ..utils import ( ExtractorError, int_or_none, str_or_none, urlencode_postdata, ) class RoosterTeethIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' _NETRC_MACHINE = 'roosterteeth' _TESTS = [{ 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', 'md5': 'e2bd7764732d785ef797700a2489f212', 'info_dict': { 'id': '9156', 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', 'ext': 'mp4', 'title': 'Million Dollars, But... The Game Announcement', 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', 'thumbnail': r're:^https?://.*\.png$', 'series': 'Million Dollars, But...', 'episode': 'Million Dollars, But... 
    def _login(self):
        """Log in with the configured FIRST account, best-effort.

        Obtains an OAuth token (stored by the site as a cookie via the
        response); on failure only warns, so extraction of free content
        still proceeds.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        try:
            self._download_json(
                'https://auth.roosterteeth.com/oauth/token',
                None, 'Logging in', data=urlencode_postdata({
                    # public web-player client id, not a secret
                    'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5',
                    'grant_type': 'password',
                    'username': username,
                    'password': password,
                }))
        except ExtractorError as e:
            msg = 'Unable to login'
            # On 401 the API body explains why; surface that in the warning.
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
                if resp:
                    error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
                    if error:
                        msg += ': ' + error
            self.report_warning(msg)
    def _real_extract(self, url):
        """Extract a Rooster Teeth episode.

        First requests the /videos endpoint for the HLS manifest (this is
        where FIRST-only gating surfaces as a 403), then the episode
        endpoint for metadata and thumbnails.
        """
        display_id = self._match_id(url)
        api_episode_url = self._EPISODE_BASE_URL + display_id

        try:
            m3u8_url = self._download_json(
                api_episode_url + '/videos', display_id,
                'Downloading video JSON metadata')['data'][0]['attributes']['url']
        except ExtractorError as e:
            # A 403 with {'access': False} means members-only content.
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
                    self.raise_login_required(
                        '%s is only available for FIRST members' % display_id)
            raise

        formats = self._extract_m3u8_formats(
            m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls')
        self._sort_formats(formats)

        episode = self._download_json(
            api_episode_url, display_id,
            'Downloading episode JSON metadata')['data'][0]
        attributes = episode['attributes']
        title = attributes.get('title') or attributes['display_title']
        video_id = compat_str(episode['id'])

        # One thumbnail per available size of the episode image.
        thumbnails = []
        for image in episode.get('included', {}).get('images', []):
            if image.get('type') == 'episode_image':
                img_attributes = image.get('attributes') or {}
                for k in ('thumb', 'small', 'medium', 'large'):
                    img_url = img_attributes.get(k)
                    if img_url:
                        thumbnails.append({
                            'id': k,
                            'url': img_url,
                        })

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': attributes.get('description') or attributes.get('caption'),
            'thumbnails': thumbnails,
            'series': attributes.get('show_title'),
            'season_number': int_or_none(attributes.get('season_number')),
            'season_id': attributes.get('season_id'),
            'episode': title,
            'episode_number': int_or_none(attributes.get('number')),
            'episode_id': str_or_none(episode.get('uuid')),
            'formats': formats,
            'channel_id': attributes.get('channel_id'),
            'duration': int_or_none(attributes.get('length')),
        }
'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { 'id': '11028566', 'ext': 'mp4', 'title': 'Toy Story 3', 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', 'thumbnail': r're:^https?://.*\.jpg$', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') return { '_type': 'url_transparent', 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, 'ie_key': InternetVideoArchiveIE.ie_key(), 'id': video_id, 'title': self._og_search_title(webpage), } ================================================ FILE: youtube_dl/extractor/roxwel.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import unified_strdate, determine_ext class RoxwelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' _TEST = { 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', 'info_dict': { 'id': 'passionpittakeawalklive', 'ext': 'flv', 'title': 'Take A Walk (live)', 'uploader': 'Passion Pit', 'uploader_id': 'passionpit', 'upload_date': '20120928', 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. 
class RozhlasIE(InfoExtractor):
    """Extractor for the Czech Radio (Český rozhlas) audio player."""

    _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://prehravac.rozhlas.cz/audio/3421320',
        'md5': '504c902dbc9e9a1fd50326eccf02a7e2',
        'info_dict': {
            'id': '3421320',
            'ext': 'mp3',
            'title': 'Echo Pavla Klusáka (30.06.2015 21:00)',
            'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let'
        }
    }, {
        'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        audio_id = self._match_id(url)
        page = self._download_webpage(
            'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id)

        # Prefer the headline right next to the player widget; fall back to
        # the og:title with the station prefix stripped.
        title = self._html_search_regex(
            r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track',
            page, 'title', default=None)
        if not title:
            title = remove_start(self._og_search_title(page), 'Radio Wave - ')

        info = {
            'id': audio_id,
            # The MP3 lives at a predictable media URL derived from the id.
            'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id,
            'title': title,
            'vcodec': 'none',
        }
        info['description'] = self._html_search_regex(
            r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track',
            page, 'description', fatal=False, group='url')
        info['duration'] = int_or_none(self._search_regex(
            r'data-duration=["\'](\d+)', page, 'duration', default=None))
        return info
audio_id) title = self._html_search_regex( r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', webpage, 'title', default=None) or remove_start( self._og_search_title(webpage), 'Radio Wave - ') description = self._html_search_regex( r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', webpage, 'description', fatal=False, group='url') duration = int_or_none(self._search_regex( r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) return { 'id': audio_id, 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, 'title': title, 'description': description, 'duration': duration, 'vcodec': 'none', } ================================================ FILE: youtube_dl/extractor/rtbf.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, int_or_none, strip_or_none, ) class RTBFIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:www\.)?rtbf\.be/ (?: video/[^?]+\?.*\bid=| ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| auvio/[^/]+\?.*\b(?P<live>l)?id= )(?P<id>\d+)''' _TESTS = [{ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', 'md5': '8c876a1cceeb6cf31b476461ade72384', 'info_dict': { 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', 'description': '(du 25/04/2014)', 'duration': 3099.54, 'upload_date': '20140425', 'timestamp': 1398456300, } }, { # geo restricted 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', 'only_matching': True, }, { 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', 'only_matching': True, }, { 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', 'only_matching': True, }, { # Live 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', 'only_matching': True, }, { # Audio 'url': 
    _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be'
    # Media hosted on one of these third-party services is delegated to the
    # matching youtube-dl extractor instead of being extracted here.
    _PROVIDERS = {
        'YOUTUBE': 'Youtube',
        'DAILYMOTION': 'Dailymotion',
        'VIMEO': 'Vimeo',
    }
    # (source key in data['sources'], format_id) pairs for the
    # progressive-download fallback path.
    _QUALITIES = [
        ('mobile', 'SD'),
        ('web', 'MD'),
        ('high', 'HD'),
    ]

    def _real_extract(self, url):
        # 'live' group is set only for /auvio/direct_... URLs (lid= param).
        live, media_id = re.match(self._VALID_URL, url).groups()
        embed_page = self._download_webpage(
            'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'),
            media_id, query={'id': media_id})
        # All media metadata is serialized into the data-media attribute.
        data = self._parse_json(self._html_search_regex(
            r'data-media="([^"]+)"', embed_page, 'media data'), media_id)

        error = data.get('error')
        if error:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)

        provider = data.get('provider')
        if provider in self._PROVIDERS:
            return self.url_result(data['url'], self._PROVIDERS[provider])

        title = data['title']
        is_live = data.get('isLive')
        if is_live:
            title = self._live_title(title)
        # Matches the '-<height>p.' quality marker inside stream URLs.
        height_re = r'-(\d+)p\.'
        formats = []

        # HLS: prefer the AES-128 encrypted variant when present.
        m3u8_url = data.get('urlHlsAes128') or data.get('urlHls')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))

        # DRM-protected VOD URLs live on a host that rejects direct access;
        # rewrite them to the plain host.
        fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x
        http_url = data.get('url')
        if formats and http_url and re.search(height_re, http_url):
            http_url = fix_url(http_url)
            # Derive one progressive HTTP format per HLS rendition by
            # substituting each rendition's height into the MP4 URL.
            # Iterate over a copy since we append to `formats` inside.
            for m3u8_f in formats[:]:
                height = m3u8_f.get('height')
                if not height:
                    continue
                f = m3u8_f.copy()
                del f['protocol']
                f.update({
                    'format_id': m3u8_f['format_id'].replace('hls-', 'http-'),
                    'url': re.sub(height_re, '-%dp.' % height, http_url),
                })
                formats.append(f)
        else:
            # No HLS renditions to mirror: fall back to the static
            # mobile/web/high source map.
            sources = data.get('sources') or {}
            for key, format_id in self._QUALITIES:
                format_url = sources.get(key)
                if not format_url:
                    continue
                height = int_or_none(self._search_regex(
                    height_re, format_url, 'height', default=None))
                formats.append({
                    'format_id': format_id,
                    'url': fix_url(format_url),
                    'height': height,
                })

        # DASH is only usable when the media is not DRM-protected.
        mpd_url = data.get('urlDash')
        if not data.get('drm') and mpd_url:
            formats.extend(self._extract_mpd_formats(
                mpd_url, media_id, mpd_id='dash', fatal=False))

        audio_url = data.get('urlAudio')
        if audio_url:
            formats.append({
                'format_id': 'audio',
                'url': audio_url,
                'vcodec': 'none',
            })
        self._sort_formats(formats)

        # Subtitle tracks are keyed by language; default to French.
        subtitles = {}
        for track in (data.get('tracks') or {}).values():
            sub_url = track.get('url')
            if not sub_url:
                continue
            subtitles.setdefault(track.get('lang') or 'fr', []).append({
                'url': sub_url,
            })

        return {
            'id': media_id,
            'formats': formats,
            'title': title,
            'description': strip_or_none(data.get('description')),
            'thumbnail': data.get('thumbnail'),
            'duration': float_or_none(data.get('realDuration')),
            'timestamp': int_or_none(data.get('liveFrom')),
            'series': data.get('programLabel'),
            'subtitles': subtitles,
            'is_live': is_live,
        }
class RteBaseIE(InfoExtractor):
    def _real_extract(self, url):
        """Try each playlist endpoint in turn, merging formats from all that
        respond; raise only if the last endpoint fails with nothing collected."""
        item_id = self._match_id(url)

        info_dict = {}
        formats = []

        ENDPOINTS = (
            'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=',
            'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=',
        )

        for num, ep_url in enumerate(ENDPOINTS, start=1):
            try:
                data = self._download_json(ep_url + item_id, item_id)
            except ExtractorError as ee:
                # Swallow the error while there are endpoints left to try or
                # some formats were already collected from earlier endpoints.
                if num < len(ENDPOINTS) or formats:
                    continue
                # Last endpoint and nothing collected: surface the API's own
                # error message on a 404, otherwise re-raise as-is.
                if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
                    error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False)
                    if error_info:
                        raise ExtractorError(
                            '%s said: %s' % (self.IE_NAME, error_info['message']),
                            expected=True)
                raise

            # NB the string values in the JSON are stored using XML escaping(!)
            show = try_get(data, lambda x: x['shows'][0], dict)
            if not show:
                continue

            # Metadata is taken from the first endpoint that returns a show.
            if not info_dict:
                title = unescapeHTML(show['title'])
                description = unescapeHTML(show.get('description'))
                thumbnail = show.get('thumbnail')
                duration = float_or_none(show.get('duration'), 1000)
                timestamp = parse_iso8601(show.get('published'))
                info_dict = {
                    'id': item_id,
                    'title': title,
                    'description': description,
                    'thumbnail': thumbnail,
                    'timestamp': timestamp,
                    'duration': duration,
                }

            mg = try_get(show, lambda x: x['media:group'][0], dict)
            if not mg:
                continue

            # RTMP(E) URL embeds app and playpath components.
            if mg.get('url'):
                m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url'])
                if m:
                    m = m.groupdict()
                    formats.append({
                        'url': m['url'] + '/' + m['app'],
                        'app': m['app'],
                        'play_path': m['playpath'],
                        'player_url': url,
                        'ext': 'flv',
                        'format_id': 'rtmp',
                    })

            if mg.get('hls_server') and mg.get('hls_url'):
                formats.extend(self._extract_m3u8_formats(
                    mg['hls_server'] + mg['hls_url'], item_id,
                    'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))

            if mg.get('hds_server') and mg.get('hds_url'):
                formats.extend(self._extract_f4m_formats(
                    mg['hds_server'] + mg['hds_url'], item_id,
                    f4m_id='hds', fatal=False))

            # Some responses carry an HDS manifest as rte:server + url.
            mg_rte_server = str_or_none(mg.get('rte:server'))
            mg_url = str_or_none(mg.get('url'))
            if mg_rte_server and mg_url:
                hds_url = url_or_none(mg_rte_server + mg_url)
                if hds_url:
                    formats.extend(self._extract_f4m_formats(
                        hds_url, item_id, f4m_id='hds', fatal=False))

        self._sort_formats(formats)

        info_dict['formats'] = formats
        return info_dict
class RteRadioIE(RteBaseIE):
    IE_NAME = 'rte:radio'
    IE_DESC = 'Raidió Teilifís Éireann radio'
    # All extraction logic lives in RteBaseIE._real_extract; this class only
    # supplies the radioplayer URL pattern and tests.
    # Radioplayer URLs have two distinct specifier formats,
    # the old format #!rii=<channel_id>:<id>:<playable_item_id>:<date>:
    # the new format #!rii=b<channel_id>_<id>_<playable_item_id>_<date>_
    # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated.
    # An <id> uniquely defines an individual recording, and is the only part we require.
    _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P<id>[0-9]+)'

    _TESTS = [{
        # Old-style player URL; HLS and RTMPE formats
        'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:',
        'md5': 'c79ccb2c195998440065456b69760411',
        'info_dict': {
            'id': '10507902',
            'ext': 'mp4',
            'title': 'Gloria',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:9ce124a7fb41559ec68f06387cabddf0',
            'timestamp': 1451203200,
            'upload_date': '20151227',
            'duration': 7230.0,
        },
    }, {
        # New-style player URL; RTMPE formats only
        'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_',
        'info_dict': {
            'id': '3250678',
            'ext': 'flv',
            'title': 'The Lyric Concert with Paul Herriott',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': '',
            'timestamp': 1333742400,
            'upload_date': '20120406',
            'duration': 7199.016,
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }]
youtube_dl/extractor/rtl2.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import ( compat_b64decode, compat_ord, compat_str, ) from ..utils import ( bytes_to_intlist, ExtractorError, intlist_to_bytes, int_or_none, strip_or_none, ) class RTL2IE(InfoExtractor): IE_NAME = 'rtl2' _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', 'info_dict': { 'id': 'folge-203-0', 'ext': 'f4v', 'title': 'GRIP sucht den Sommerkönig', 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' }, 'params': { # rtmp download 'skip_download': True, }, 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', 'info_dict': { 'id': 'anna-erwischt-alex', 'ext': 'mp4', 'title': 'Anna erwischt Alex!', 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' 
    def _real_extract(self, url):
        # vico_id/vivi_id are present in /video/ URLs; /folge/ URLs only give
        # a display_id and the ids must be scraped from the page.
        vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups()
        if not vico_id:
            webpage = self._download_webpage(url, display_id)

            mobj = re.search(
                r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"',
                webpage)
            if mobj:
                vico_id = mobj.group('vico_id')
                vivi_id = mobj.group('vivi_id')
            else:
                # Older pages expose the ids as inline JS object members.
                vico_id = self._html_search_regex(
                    r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id')
                vivi_id = self._html_search_regex(
                    r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')

        info = self._download_json(
            'https://service.rtl2.de/api-player-vipo/video.php', display_id, query={
                'vico_id': vico_id,
                'vivi_id': vivi_id,
            })
        video_info = info['video']
        title = video_info['titel']

        formats = []

        rtmp_url = video_info.get('streamurl')
        if rtmp_url:
            rtmp_url = rtmp_url.replace('\\', '')
            stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL')
            # Connection parameters the RTL2 RTMP server expects from the
            # official JW Player flash client.
            rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0']

            formats.append({
                'format_id': 'rtmp',
                'url': rtmp_url,
                'play_path': stream_url,
                'player_url': 'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf',
                'page_url': url,
                'flash_version': 'LNX 11,2,202,429',
                'rtmp_conn': rtmp_conn,
                'no_resume': True,
                'preference': 1,
            })

        m3u8_url = video_info.get('streamurl_hls')
        if m3u8_url:
            formats.extend(self._extract_akamai_formats(m3u8_url, display_id))

        self._sort_formats(formats)

        return {
            'id': display_id,
            'title': title,
            'thumbnail': video_info.get('image'),
            'description': video_info.get('beschreibung'),
            'duration': int_or_none(video_info.get('duration')),
            'formats': formats,
        }
    def _real_extract(self, url):
        video_id = self._match_id(url)

        stream_data = self._download_json(
            self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id)

        # streamUrl is base64('<ciphertext>:<iv>'); decrypt the ciphertext
        # with AES-CBC using the static key above.
        data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':')
        stream_url = intlist_to_bytes(aes_cbc_decrypt(
            bytes_to_intlist(compat_b64decode(data)),
            bytes_to_intlist(self._AES_KEY),
            bytes_to_intlist(compat_b64decode(iv))
        ))
        if b'rtl2_you_video_not_found' in stream_url:
            raise ExtractorError('video not found', expected=True)

        # The last byte is the padding length (PKCS#7 style) -- strip it
        # before decoding the decrypted m3u8 URL.
        formats = self._extract_m3u8_formats(
            stream_url[:-compat_ord(stream_url[-1])].decode(),
            video_id, 'mp4', 'm3u8_native')
        self._sort_formats(formats)

        video_data = self._download_json(
            self._BACKWERK_BASE_URL + 'video/' + video_id, video_id)

        series = video_data.get('formatTitle')
        title = episode = video_data.get('title') or series
        if series and series != title:
            title = '%s - %s' % (series, title)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': strip_or_none(video_data.get('description')),
            'thumbnail': video_data.get('image'),
            'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000),
            'series': series,
            'episode': episode,
            'age_limit': int_or_none(video_data.get('minimumAge')),
        }
class RTL2YouSeriesIE(RTL2YouBaseIE):
    """Playlist extractor: expands a RTL2 You series page into its videos."""

    IE_NAME = 'rtl2:you:series'
    _VALID_URL = r'http?://you\.rtl2\.de/videos/(?P<id>\d+)'
    _TEST = {
        'url': 'http://you.rtl2.de/videos/115/dragon-ball',
        'info_dict': {
            'id': '115',
        },
        'playlist_mincount': 5,
    }

    def _real_extract(self, url):
        series_id = self._match_id(url)
        # An absurdly high limit fetches the whole series in one request.
        listing = self._download_json(
            self._BACKWERK_BASE_URL + 'videos', series_id, query={
                'formatId': series_id,
                'limit': 1000000000,
            })

        entries = []
        for item in listing.get('videos', []):
            video_id = compat_str(item['videoId'])
            if video_id:
                entries.append(self.url_result(
                    'http://you.rtl2.de/video/%s/%s' % (series_id, video_id),
                    'RTL2You', video_id))
        return self.playlist_result(entries, series_id)
a3t 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', 'md5': 'dea7474214af1271d91ef332fb8be7ea', 'info_dict': { 'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed', 'ext': 'mp4', 'timestamp': 1424039400, 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } }, { # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275) # best format available nettv 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', 'info_dict': { 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', 'ext': 'mp4', 'title': 'RTL Nieuws - Meer beelden van overval juwelier', 'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', 'timestamp': 1437233400, 'upload_date': '20150718', 'duration': 30.474, }, 'params': { 'skip_download': True, }, }, { # encrypted m3u8 streams, georestricted 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', 'only_matching': True, }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', 'only_matching': True, }, { 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', 'only_matching': True, }, { 'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/', 'only_matching': True, }, { 'url': 
'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl', 'only_matching': True, }, { # new embed URL schema 'url': 'https://embed.rtl.nl/#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', 'only_matching': True, }] def _real_extract(self, url): uuid = self._match_id(url) info = self._download_json( 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid, uuid) material = info['material'][0] title = info['abstracts'][0]['name'] subtitle = material.get('title') if subtitle: title += ' - %s' % subtitle description = material.get('synopsis') meta = info.get('meta', {}) videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats( m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) self._sort_formats(formats) thumbnails = [] for p in ('poster_base_url', '"thumb_base_url"'): if not meta.get(p): continue thumbnails.append({ 'url': self._proto_relative_url(meta[p] + uuid), 'width': int_or_none(self._search_regex( r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)), 'height': int_or_none(self._search_regex( r'/sz=[0-9]+x([0-9]+)', meta[p], 'thumbnail height', fatal=False)) }) return { 'id': uuid, 'title': title, 'formats': formats, 'timestamp': material['original_date'], 'description': description, 'duration': parse_duration(material.get('duration')), 'thumbnails': thumbnails, } ================================================ FILE: youtube_dl/extractor/rtp.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( determine_ext, js_to_json, ) class RTPIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' 
    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        title = self._html_search_meta(
            'twitter:title', webpage, display_name='title', fatal=True)

        # Player configuration is an inline JS object passed to RTPPlayer().
        config = self._parse_json(self._search_regex(
            r'(?s)RTPPlayer\(({.+?})\);', webpage,
            'player config'), video_id, js_to_json)
        file_url = config['file']
        ext = determine_ext(file_url)
        if ext == 'm3u8':
            file_key = config.get('fileKey')
            # NOTE(review): fatal=file_key makes HLS extraction fatal exactly
            # when a fallback fileKey URL exists -- confirm this polarity is
            # intended before relying on it.
            formats = self._extract_m3u8_formats(
                file_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls', fatal=file_key)
            if file_key:
                # Direct on-demand CDN fallback, preferred over HLS.
                formats.append({
                    'url': 'https://cdn-ondemand.rtp.pt' + file_key,
                    'preference': 1,
                })
            self._sort_formats(formats)
        else:
            formats = [{
                'url': file_url,
                'ext': ext,
            }]
        if config.get('mediaType') == 'audio':
            for f in formats:
                f['vcodec'] = 'none'

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': self._html_search_meta(['description', 'twitter:description'], webpage),
            'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
        }
'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', 'md5': '753b877968ad8afaeddccc374d4256a5', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', 'ext': 'mp4', 'duration': 1488, 'title': 'Les Enfants Terribles', 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', 'uploader': 'Divers', 'upload_date': '19680921', 'timestamp': -40280400, 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', 'info_dict': { 'id': '5624065', 'title': 'Passe-moi les jumelles', }, 'playlist_mincount': 4, }, { 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', 'info_dict': { 'id': '5745975', 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', 'ext': 'mp4', 'duration': 48, 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski', 'description': 'Hockey - Playoff', 'uploader': 'Hockey', 'upload_date': '20140403', 'timestamp': 1396556882, 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, }, 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', 'md5': '9bb06503773c07ce83d3cbd793cebb91', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', 'ext': 'mp4', 'duration': 33, 'title': 'Londres cachée par un épais smog', 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', 'uploader': 'L\'actu en vidéo', 'upload_date': '20140403', 'timestamp': 
    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        media_id = m.group('rts_id') or m.group('id')
        display_id = m.group('display_id') or media_id

        def download_json(internal_id):
            # Article-as-JSON endpoint; works for both articles and media.
            return self._download_json(
                'http://www.rts.ch/a/%s.html?f=json/article' % internal_id,
                display_id)

        all_info = download_json(media_id)

        # media_id extracted out of URL is not always a real id
        if 'video' not in all_info and 'audio' not in all_info:
            entries = []

            # Article listing its media as items -> playlist of sub-URLs.
            for item in all_info.get('items', []):
                item_url = item.get('url')
                if not item_url:
                    continue
                entries.append(self.url_result(item_url, 'RTS'))

            if not entries:
                page, urlh = self._download_webpage_handle(url, display_id)
                # If the request was redirected to a different id, restart
                # extraction from the final URL.
                if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id:
                    return self.url_result(urlh.geturl(), 'RTS')

                # article with videos on rhs
                videos = re.findall(
                    r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"',
                    page)
                if not videos:
                    videos = re.findall(
                        r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"',
                        page)
                if videos:
                    entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos]

            if entries:
                return self.playlist_result(entries, media_id, all_info.get('title'))

            # Single media embedded in the article: find its internal id and
            # refetch the JSON with it.
            internal_id = self._html_search_regex(
                r'<(?:video|audio) data-id="([0-9]+)"', page,
                'internal video id')
            all_info = download_json(internal_id)

        media_type = 'video' if 'video' in all_info else 'audio'

        # check for errors
        self._get_media_data('rts', media_type, media_id)

        info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']

        title = info['title']

        def extract_bitrate(url):
            # Stream URLs carry their bitrate as '-<kbps>k.'.
            return int_or_none(self._search_regex(
                r'-([0-9]+)k\.', url, 'bitrate', default=None))

        formats = []
        streams = info.get('streams', {})
        for format_id, format_url in streams.items():
            # Skip SD variants when the full-quality variant is present.
            if format_id == 'hds_sd' and 'hds' in streams:
                continue
            if format_id == 'hls_sd' and 'hls' in streams:
                continue
            ext = determine_ext(format_url)
            if ext in ('m3u8', 'f4m'):
                # Manifest URLs need a per-request auth token.
                format_url = self._get_tokenized_src(format_url, media_id, format_id)
                if ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0',
                        media_id, f4m_id=format_id, fatal=False))
                else:
                    formats.extend(self._extract_m3u8_formats(
                        format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False))
            else:
                formats.append({
                    'format_id': format_id,
                    'url': format_url,
                    'tbr': extract_bitrate(format_url),
                })

        # Relative media paths are served from the download host (audio has
        # its own host suffix).
        download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '')
        for media in info.get('media', []):
            media_url = media.get('url')
            if not media_url or re.match(r'https?://', media_url):
                continue
            rate = media.get('rate')
            ext = media.get('ext') or determine_ext(media_url, 'mp4')
            format_id = ext
            if rate:
                format_id += '-%dk' % rate
            formats.append({
                'format_id': format_id,
                'url': urljoin(download_base, media_url),
                'tbr': rate or extract_bitrate(media_url),
            })

        self._check_formats(formats, media_id)
        self._sort_formats(formats)

        # Duration may come as a number or a parseable string.
        duration = info.get('duration') or info.get('cutout') or info.get('cutduration')
        if isinstance(duration, compat_str):
            duration = parse_duration(duration)

        return {
            'id': media_id,
            'display_id': display_id,
            'formats': formats,
            'title': title,
            'description': info.get('intro'),
            'duration': duration,
            'view_count': int_or_none(info.get('plays')),
            'uploader': info.get('programName'),
            'timestamp': parse_iso8601(info.get('broadcast_date')),
            'thumbnail': unescapeHTML(info.get('preview_image_url')),
        }
    def _real_initialize(self):
        # The odin/loki endpoint derives a "manager" token from the client's
        # base64-encoded User-Agent; it is later embedded in the PNG URL.
        user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
        self._manager = self._download_json(
            'http://www.rtve.es/odin/loki/' + user_agent_b64,
            None, 'Fetching manager info')['manager']

    @staticmethod
    def _decrypt_url(png):
        # The stream URLs are hidden inside a PNG: each tEXt chunk carries an
        # alphabet table and an obfuscated '<quality>%%<digits>' payload.
        # Yields one (quality, url) tuple per tEXt chunk.
        encrypted_data = io.BytesIO(compat_b64decode(png)[8:])  # skip PNG signature
        while True:
            # PNG chunk layout: 4-byte big-endian length, 4-byte type,
            # <length> bytes of data, 4-byte CRC.
            length = compat_struct_unpack('!I', encrypted_data.read(4))[0]
            chunk_type = encrypted_data.read(4)
            if chunk_type == b'IEND':
                break
            data = encrypted_data.read(length)
            if chunk_type == b'tEXt':
                alphabet_data, text = data.split(b'\0')
                quality, url_data = text.split(b'%%')
                # Rebuild the alphabet: real entries are interleaved with a
                # varying number of filler bytes (stride cycles mod 4).
                alphabet = []
                e = 0
                d = 0
                for l in _bytes_to_chr(alphabet_data):
                    if d == 0:
                        alphabet.append(l)
                        d = e = (e + 1) % 4
                    else:
                        d -= 1
                # Decode the URL: pairs of decimal digits (tens then units,
                # with junk digits interleaved) index into the alphabet.
                url = ''
                f = 0
                e = 3
                b = 1
                for letter in _bytes_to_chr(url_data):
                    if f == 0:
                        l = int(letter) * 10
                        f = 1
                    else:
                        if e == 0:
                            l += int(letter)
                            url += alphabet[l]
                            e = (b + 3) % 4
                            f = 0
                            b += 1
                        else:
                            e -= 1
                yield quality.decode(), url
            encrypted_data.read(4)  # CRC

    def _extract_png_formats(self, video_id):
        # Fetch the obfuscated PNG and turn each decrypted (quality, url)
        # pair into a format entry (HLS/DASH manifests are expanded).
        png = self._download_webpage(
            'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id),
            video_id, 'Downloading url information', query={'q': 'v2'})
        q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL'])
        formats = []
        for quality, video_url in self._decrypt_url(png):
            ext = determine_ext(video_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    video_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False))
            elif ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    video_url, video_id, 'dash', fatal=False))
            else:
                formats.append({
                    'format_id': quality,
                    'quality': q(quality),
                    'url': video_url,
                })
        self._sort_formats(formats)
        return formats
class RTVELiveIE(RTVEALaCartaIE):
    """Extractor for rtve.es live channel streams (/directo/ pages)."""
    IE_NAME = 'rtve.es:live'
    IE_DESC = 'RTVE.es live streams'
    _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'

    _TESTS = [{
        'url': 'http://www.rtve.es/directo/la-1/',
        'info_dict': {
            'id': 'la-1',
            'ext': 'mp4',
            'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
        },
        'params': {
            'skip_download': 'live stream',
        }
    }]

    def _real_extract(self, url):
        channel_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, channel_id)

        # The og:title looks like 'Estoy viendo <channel> en directo en
        # RTVE.es'; strip both decorations to keep the bare channel name.
        stream_title = self._og_search_title(webpage)
        stream_title = remove_end(stream_title, ' en directo en RTVE.es')
        stream_title = remove_start(stream_title, 'Estoy viendo ')

        asset_id = self._search_regex(
            (r'playerId=player([0-9]+)',
             r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)',
             r'data-id=["\'](\d+)'),
            webpage, 'internal video ID')

        return {
            'id': channel_id,
            'title': self._live_title(stream_title),
            'formats': self._extract_png_formats(asset_id),
            'is_live': True,
        }
class RTVNHIE(InfoExtractor):
    """Extractor for rtvnh.nl (RTV Noord-Holland) videos."""
    _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.rtvnh.nl/video/131946',
        'md5': 'cdbec9f44550763c8afc96050fa747dc',
        'info_dict': {
            'id': '131946',
            'ext': 'mp4',
            'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw',
            'thumbnail': r're:^https?:.*\.jpg$'
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)

        meta = self._parse_json(self._download_webpage(
            'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id)

        status = meta.get('status')
        if status != 200:
            # Use %s rather than %d here: when the JSON lacks a status field
            # status is None and %d would raise TypeError, hiding the real
            # API-level failure behind a formatting crash.
            raise ExtractorError(
                '%s returned error code %s' % (self.IE_NAME, status),
                expected=True)

        formats = []
        rtmp_formats = self._extract_smil_formats(
            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id)
        formats.extend(rtmp_formats)

        for rtmp_format in rtmp_formats:
            rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])

            # Mirror each RTMP format as RTSP (same stream, other protocol).
            rtsp_format = rtmp_format.copy()
            del rtsp_format['play_path']
            del rtsp_format['ext']
            rtsp_format.update({
                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                'url': rtmp_url.replace('rtmp://', 'rtsp://'),
                'protocol': 'rtsp',
            })
            formats.append(rtsp_format)

            # Derive HTTP-based HLS/HDS manifests from the RTMP URL.
            http_base_url = rtmp_url.replace('rtmp://', 'http://')
            formats.extend(self._extract_m3u8_formats(
                http_base_url + '/playlist.m3u8', video_id, 'mp4',
                'm3u8_native', m3u8_id='hls', fatal=False))
            formats.extend(self._extract_f4m_formats(
                http_base_url + '/manifest.f4m',
                video_id, f4m_id='hds', fatal=False))
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': meta['title'].strip(),
            'thumbnail': meta.get('image'),
            'formats': formats
        }
}, 'params': { 'skip_download': True, } }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) playlist_url = self._search_regex( r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'playlist url', group='url') data = self._download_json( playlist_url, video_id, 'Downloading playlist')[0] return self._parse_jwplayer_data(data, video_id=video_id) ================================================ FILE: youtube_dl/extractor/ruhd.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class RUHDIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' _TEST = { 'url': 'http://www.ruhd.ru/play.php?vid=207', 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', 'info_dict': { 'id': '207', 'ext': 'divx', 'title': 'КОТ бааааам', 'description': 'классный кот)', 'thumbnail': r're:^http://.*\.jpg$', } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._html_search_regex( r'<param name="src" value="([^"]+)"', webpage, 'video url') title = self._html_search_regex( r'<title>([^<]+)   RUHD\.ru - Видео Высокого качества №1 в России!', webpage, 'title') description = self._html_search_regex( r'(?s)
class RutubeBaseIE(InfoExtractor):
    """Shared plumbing for the Rutube extractors: thin wrappers around the
    /api/video and /api/play/options endpoints plus metadata and format
    normalization helpers."""

    def _download_api_info(self, video_id, query=None):
        # Fetch the /api/video/<id>/ metadata JSON.
        if not query:
            query = {}
        query['format'] = 'json'
        return self._download_json(
            'http://rutube.ru/api/video/%s/' % video_id,
            video_id, 'Downloading video JSON',
            'Unable to download video JSON', query=query)

    @staticmethod
    def _extract_info(video, video_id=None, require_title=True):
        """Map a Rutube API video dict onto an info dict (no formats)."""
        title = video['title'] if require_title else video.get('title')

        # is_adult is tri-state: None (unknown) stays None, otherwise 18/0.
        age_limit = video.get('is_adult')
        if age_limit is not None:
            age_limit = 18 if age_limit is True else 0

        uploader_id = try_get(video, lambda x: x['author']['id'])
        category = try_get(video, lambda x: x['category']['name'])

        return {
            'id': video.get('id') or video_id if video_id else video['id'],
            'title': title,
            'description': video.get('description'),
            'thumbnail': video.get('thumbnail_url'),
            'duration': int_or_none(video.get('duration')),
            'uploader': try_get(video, lambda x: x['author']['name']),
            'uploader_id': compat_str(uploader_id) if uploader_id else None,
            'timestamp': unified_timestamp(video.get('created_ts')),
            'category': [category] if category else None,
            'age_limit': age_limit,
            'view_count': int_or_none(video.get('hits')),
            'comment_count': int_or_none(video.get('comments_count')),
            'is_live': bool_or_none(video.get('is_livestream')),
        }

    def _download_and_extract_info(self, video_id, query=None):
        # Convenience wrapper: fetch + normalize metadata in one call.
        return self._extract_info(
            self._download_api_info(video_id, query=query), video_id)

    def _download_api_options(self, video_id, query=None):
        # Fetch the /api/play/options/<id>/ JSON (stream balancer info).
        # Geo verification headers are sent because playback options are
        # geo-restricted for some videos.
        if not query:
            query = {}
        query['format'] = 'json'
        return self._download_json(
            'http://rutube.ru/api/play/options/%s/' % video_id,
            video_id, 'Downloading options JSON',
            'Unable to download options JSON',
            headers=self.geo_verification_headers(), query=query)

    def _extract_formats(self, options, video_id):
        """Build a sorted formats list from the video_balancer mapping
        (format_id -> URL), expanding HLS/HDS manifests inline."""
        formats = []
        for format_id, format_url in options['video_balancer'].items():
            ext = determine_ext(format_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, f4m_id=format_id, fatal=False))
            else:
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                })
        self._sort_formats(formats)
        return formats

    def _download_and_extract_formats(self, video_id, query=None):
        # Convenience wrapper: fetch playback options + build formats.
        return self._extract_formats(
            self._download_api_options(video_id, query=query), video_id)
'upload_date': '20131223', 'uploader_id': '297833', 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix

восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', 'uploader': 'subziro89 ILya', 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', }, 'params': { 'skip_download': True, }, }, { 'url': 'http://rutube.ru/play/embed/8083783', 'only_matching': True, }, { # private video 'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ', 'only_matching': True, }] def _real_extract(self, url): embed_id = self._match_id(url) # Query may contain private videos token and should be passed to API # requests (see #19163) query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) options = self._download_api_options(embed_id, query) video_id = options['effective_video'] formats = self._extract_formats(options, video_id) info = self._download_and_extract_info(video_id, query) info.update({ 'extractor_key': 'Rutube', 'formats': formats, }) return info class RutubePlaylistBaseIE(RutubeBaseIE): def _next_page_url(self, page_num, playlist_id, *args, **kwargs): return self._PAGE_TEMPLATE % (playlist_id, page_num) def _entries(self, playlist_id, *args, **kwargs): next_page_url = None for pagenum in itertools.count(1): page = self._download_json( next_page_url or self._next_page_url( pagenum, playlist_id, *args, **kwargs), playlist_id, 'Downloading page %s' % pagenum) results = page.get('results') if not results or not isinstance(results, list): break for result in results: video_url = url_or_none(result.get('video_url')) if not video_url: continue entry = self._extract_info(result, require_title=False) entry.update({ '_type': 'url', 'url': video_url, 'ie_key': RutubeIE.ie_key(), }) yield entry next_page_url = page.get('next') if not next_page_url or not page.get('has_next'): break def _extract_playlist(self, playlist_id, *args, **kwargs): return self.playlist_result( self._entries(playlist_id, *args, **kwargs), playlist_id, kwargs.get('playlist_name')) def _real_extract(self, url): return 
class RutubePlaylistIE(RutubePlaylistBaseIE):
    """Extractor for Rutube playlists addressed via pl_id/pl_type query
    parameters on a video/embed URL."""
    IE_NAME = 'rutube:playlist'
    IE_DESC = 'Rutube playlists'
    _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag',
        'info_dict': {
            'id': '3097',
        },
        'playlist_count': 27,
    }, {
        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source',
        'only_matching': True,
    }]
    _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json'

    @classmethod
    def suitable(cls, url):
        # Only claim URLs carrying both a playlist type and a numeric id;
        # plain video URLs fall through to RutubeIE.
        if not super(RutubePlaylistIE, cls).suitable(url):
            return False
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0])

    def _next_page_url(self, page_num, playlist_id, item_kind):
        # Unlike the sibling playlist extractors, the item kind is part of
        # the API path as well.
        return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num)

    def _real_extract(self, url):
        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        kind = query['pl_type'][0]
        pl_id = query['pl_id'][0]
        return self._extract_playlist(pl_id, item_kind=kind)
    def _real_extract(self, url):
        """Extract a RUTV/VGTRK player video or live stream.

        The URL path determines the player type; live streams are served
        from the 'datalive' JSON endpoint, everything else from 'datavideo'.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_path = mobj.group('path')

        # Map the player path onto a video type ('video' or 'live').
        if re.match(r'flash\d+v', video_path):
            video_type = 'video'
        elif video_path.startswith('iframe'):
            video_type = mobj.group('type')
            if video_type == 'swf':
                video_type = 'video'
        elif video_path.startswith('index/iframe/cast_id'):
            video_type = 'live'

        is_live = video_type == 'live'

        json_data = self._download_json(
            'http://player.rutv.ru/iframe/data%s/id/%s'
            % ('live' if is_live else 'video', video_id),
            video_id, 'Downloading JSON')

        if json_data['errors']:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, json_data['errors']),
                expected=True)

        playlist = json_data['data']['playlist']
        medialist = playlist['medialist']
        media = medialist[0]

        # Per-media errors (e.g. a finished live translation) are reported
        # inside the playlist entry itself.
        if media['errors']:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, media['errors']), expected=True)

        view_count = playlist.get('count_views')
        priority_transport = playlist['priority_transport']

        thumbnail = media['picture']
        width = int_or_none(media['width'])
        height = int_or_none(media['height'])
        description = media['anons']
        title = media['title']
        duration = int_or_none(media.get('duration'))

        formats = []

        # sources maps transport name -> {quality -> URL}; the transport
        # advertised as priority_transport gets the higher preference.
        for transport, links in media['sources'].items():
            for quality, url in links.items():
                preference = -1 if priority_transport == transport else -2
                if transport == 'rtmp':
                    # Split the RTMP URL into base/app/playpath for the
                    # RTMP downloader.
                    mobj = re.search(
                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
                    if not mobj:
                        continue
                    fmt = {
                        'url': mobj.group('url'),
                        'play_path': mobj.group('playpath'),
                        'app': mobj.group('app'),
                        'page_url': 'http://player.rutv.ru',
                        'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22',
                        'rtmp_live': True,
                        'ext': 'flv',
                        'vbr': str_to_int(quality),
                        'preference': preference,
                    }
                elif transport == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        url, video_id, 'mp4', preference=preference,
                        m3u8_id='hls'))
                    continue
                else:
                    fmt = {
                        'url': url
                    }
                fmt.update({
                    'width': width,
                    'height': height,
                    'format_id': '%s-%s' % (transport, quality),
                })
                formats.append(fmt)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': self._live_title(title) if is_live else title,
            'description': description,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'duration': duration,
            'formats': formats,
            'is_live': is_live,
        }
youtube_dl/extractor/ruutu.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, ExtractorError, find_xpath_attr, int_or_none, unified_strdate, url_or_none, xpath_attr, xpath_text, ) class RuutuIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/| static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid= ) (?P\d+) ''' _TESTS = [ { 'url': 'http://www.ruutu.fi/video/2058907', 'md5': 'ab2093f39be1ca8581963451b3c0234f', 'info_dict': { 'id': '2058907', 'ext': 'mp4', 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!', 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 114, 'age_limit': 0, }, }, { 'url': 'http://www.ruutu.fi/video/2057306', 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', 'info_dict': { 'id': '2057306', 'ext': 'mp4', 'title': 'Superpesis: katso koko kausi Ruudussa', 'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 40, 'age_limit': 0, }, }, { 'url': 'http://www.supla.fi/supla/2231370', 'md5': 'df14e782d49a2c0df03d3be2a54ef949', 'info_dict': { 'id': '2231370', 'ext': 'mp4', 'title': 'Osa 1: Mikael Jungner', 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 0, }, }, # Episode where is "NOT-USED", but has other # downloadable sources available. 
    def _real_extract(self, url):
        """Extract a Ruutu/Supla media item from the media-xml-cache feed.

        Formats are collected by recursively walking the <Clip> element;
        each candidate URL is first exchanged for an authenticated URL via
        the auth/access endpoint (best effort).
        """
        video_id = self._match_id(url)

        video_xml = self._download_xml(
            '%s/media-xml-cache' % self._API_BASE, video_id,
            query={'id': video_id})

        formats = []
        # Tracks raw and authenticated URLs to avoid duplicate formats.
        processed_urls = []

        def extract_formats(node):
            # Recursive walk: *Files containers recurse, *File leaves yield
            # candidate stream URLs.
            for child in node:
                if child.tag.endswith('Files'):
                    extract_formats(child)
                elif child.tag.endswith('File'):
                    video_url = child.text
                    if (not video_url or video_url in processed_urls
                            or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))):
                        continue
                    processed_urls.append(video_url)
                    ext = determine_ext(video_url)
                    # Best-effort token exchange; on failure fall back to
                    # the raw URL.
                    auth_video_url = url_or_none(self._download_webpage(
                        '%s/auth/access/v2' % self._API_BASE, video_id,
                        note='Downloading authenticated %s stream URL' % ext,
                        fatal=False, query={'stream': video_url}))
                    if auth_video_url:
                        processed_urls.append(auth_video_url)
                        video_url = auth_video_url
                    if ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            video_url, video_id, 'mp4',
                            entry_protocol='m3u8_native', m3u8_id='hls',
                            fatal=False))
                    elif ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            video_url, video_id, f4m_id='hds', fatal=False))
                    elif ext == 'mpd':
                        # video-only and audio-only streams are of different
                        # duration resulting in out of sync issue
                        continue
                        # NOTE: intentionally unreachable — kept for easy
                        # re-enabling if the DASH streams get fixed upstream.
                        formats.extend(self._extract_mpd_formats(
                            video_url, video_id, mpd_id='dash', fatal=False))
                    elif ext == 'mp3' or child.tag == 'AudioMediaFile':
                        formats.append({
                            'format_id': 'audio',
                            'url': video_url,
                            'vcodec': 'none',
                        })
                    else:
                        # Progressive HTTP or RTMP; anything else is skipped.
                        proto = compat_urllib_parse_urlparse(video_url).scheme
                        if not child.tag.startswith('HTTP') and proto != 'rtmp':
                            continue
                        preference = -1 if proto == 'rtmp' else 1
                        label = child.get('label')
                        tbr = int_or_none(child.get('bitrate'))
                        format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto
                        if not self._is_valid_url(video_url, video_id, format_id):
                            continue
                        width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]]
                        formats.append({
                            'format_id': format_id,
                            'url': video_url,
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'preference': preference,
                        })

        extract_formats(video_xml.find('./Clip'))

        def pv(name):
            # Look up a named passthrough variable from the clip metadata.
            node = find_xpath_attr(
                video_xml, './Clip/PassthroughVariables/variable', 'name', name)
            if node is not None:
                return node.get('value')

        if not formats:
            # No usable streams: report DRM or paid content explicitly.
            drm = xpath_text(video_xml, './Clip/DRM', default=None)
            if drm:
                raise ExtractorError('This video is DRM protected.', expected=True)
            ns_st_cds = pv('ns_st_cds')
            if ns_st_cds != 'free':
                raise ExtractorError('This video is %s.' % ns_st_cds, expected=True)

        self._sort_formats(formats)

        themes = pv('themes')

        return {
            'id': video_id,
            'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True),
            'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'),
            'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'),
            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')),
            'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
            'upload_date': unified_strdate(pv('date_start')),
            'series': pv('series_name'),
            'season_number': int_or_none(pv('season_number')),
            'episode_number': int_or_none(pv('episode_number')),
            'categories': themes.split(',') if themes else [],
            'formats': formats,
        }
leik FH og Vals í úrslitum Olísdeildar karla í handbolta.', 'timestamp': 1494963600, 'upload_date': '20170516', }, }, { # mp3 'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619', 'md5': '395ea250c8a13e5fdb39d4670ef85378', 'info_dict': { 'id': '1153630', 'display_id': 'morgunutvarpid/20170619', 'ext': 'mp3', 'title': 'Morgunútvarpið', 'description': 'md5:a4cf1202c0a1645ca096b06525915418', 'timestamp': 1497855000, 'upload_date': '20170619', }, }, { 'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614', 'only_matching': True, }, { 'url': 'http://www.ruv.is/node/1151854', 'only_matching': True, }, { 'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun', 'only_matching': True, }, { 'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619', 'only_matching': True, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) title = self._og_search_title(webpage) FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' media_url = self._html_search_regex( FIELD_RE % 'src', webpage, 'video URL', group='url') video_id = self._search_regex( r']+\bhref=["\']https?://www\.ruv\.is/node/(\d+)', webpage, 'video id', default=display_id) ext = determine_ext(media_url) if ext == 'm3u8': formats = self._extract_m3u8_formats( media_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') elif ext == 'mp3': formats = [{ 'format_id': 'mp3', 'url': media_url, 'vcodec': 'none', }] else: formats = [{ 'url': media_url, }] description = self._og_search_description(webpage, default=None) thumbnail = self._og_search_thumbnail( webpage, default=None) or self._search_regex( FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False) timestamp = unified_timestamp(self._html_search_meta( 'article:published_time', webpage, 'timestamp', fatal=False)) return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'formats': formats, } 
================================================ FILE: youtube_dl/extractor/s4c.py ================================================ # coding: utf-8 from __future__ import unicode_literals from functools import partial as partial_f from .common import InfoExtractor from ..utils import ( float_or_none, merge_dicts, T, traverse_obj, txt_or_none, url_or_none, ) class S4CIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/programme/(?P\d+)' _TESTS = [{ 'url': 'https://www.s4c.cymru/clic/programme/861362209', 'info_dict': { 'id': '861362209', 'ext': 'mp4', 'title': 'Y Swn', 'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0', 'duration': 5340, 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg', }, }, { 'url': 'https://www.s4c.cymru/clic/programme/856636948', 'info_dict': { 'id': '856636948', 'ext': 'mp4', 'title': 'Am Dro', 'duration': 2880, 'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe', 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg', }, }] def _real_extract(self, url): video_id = self._match_id(url) details = self._download_json( 'https://www.s4c.cymru/df/full_prog_details', video_id, query={ 'lang': 'e', 'programme_id': video_id, }, fatal=False) player_config = self._download_json( 'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={ 'programme_id': video_id, 'signed': '0', 'lang': 'en', 'mode': 'od', 'appId': 'clic', 'streamName': '', }, note='Downloading player config JSON') m3u8_url = self._download_json( 'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={ 'mode': 'od', 'application': 'clic', 'region': 'WW', 'extra': 'false', 'thirdParty': 'false', 'filename': player_config['filename'], }, note='Downloading streaming urls JSON')['hls'] formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native') self._sort_formats(formats) subtitles = {} for sub in traverse_obj(player_config, 
('subtitles', lambda _, v: url_or_none(v['0']))): subtitles.setdefault(sub.get('3', 'en'), []).append({ 'url': sub['0'], 'name': sub.get('1'), }) return merge_dicts({ 'id': video_id, 'formats': formats, 'subtitles': subtitles, 'thumbnail': url_or_none(player_config.get('poster')), }, traverse_obj(details, ('full_prog_details', 0, { 'title': (('programme_title', 'series_title'), T(txt_or_none)), 'description': ('full_billing', T(txt_or_none)), 'duration': ('duration', T(partial_f(float_or_none, invscale=60))), }), get_all=False), rev=True) class S4CSeriesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P\d+)' _TESTS = [{ 'url': 'https://www.s4c.cymru/clic/series/864982911', 'playlist_mincount': 6, 'info_dict': { 'id': '864982911', 'title': 'Iaith ar Daith', }, }, { 'url': 'https://www.s4c.cymru/clic/series/866852587', 'playlist_mincount': 8, 'info_dict': { 'id': '866852587', 'title': 'FFIT Cymru', }, }] def _real_extract(self, url): series_id = self._match_id(url) series_details = self._download_json( 'https://www.s4c.cymru/df/series_details', series_id, query={ 'lang': 'e', 'series_id': series_id, 'show_prog_in_series': 'Y' }, note='Downloading series details JSON') return self.playlist_result( (self.url_result('https://www.s4c.cymru/clic/programme/' + episode_id, S4CIE, episode_id) for episode_id in traverse_obj(series_details, ('other_progs_in_series', Ellipsis, 'id'))), playlist_id=series_id, playlist_title=traverse_obj( series_details, ('full_prog_details', 0, 'series_title', T(txt_or_none)))) ================================================ FILE: youtube_dl/extractor/safari.py ================================================ # coding: utf-8 from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urlparse, ) from ..utils import ( ExtractorError, update_url_query, ) class SafariBaseIE(InfoExtractor): _LOGIN_URL = 
'https://learning.oreilly.com/accounts/login/' _NETRC_MACHINE = 'safari' _API_BASE = 'https://learning.oreilly.com/api/v1' _API_FORMAT = 'json' LOGGED_IN = False def _real_initialize(self): self._login() def _login(self): username, password = self._get_login_info() if username is None: return _, urlh = self._download_webpage_handle( 'https://learning.oreilly.com/accounts/login-check/', None, 'Downloading login page') def is_logged(urlh): return 'learning.oreilly.com/home/' in urlh.geturl() if is_logged(urlh): self.LOGGED_IN = True return redirect_url = urlh.geturl() parsed_url = compat_urlparse.urlparse(redirect_url) qs = compat_parse_qs(parsed_url.query) next_uri = compat_urlparse.urljoin( 'https://api.oreilly.com', qs['next'][0]) auth, urlh = self._download_json_handle( 'https://www.oreilly.com/member/auth/login/', None, 'Logging in', data=json.dumps({ 'email': username, 'password': password, 'redirect_uri': next_uri, }).encode(), headers={ 'Content-Type': 'application/json', 'Referer': redirect_url, }, expected_status=400) credentials = auth.get('credentials') if (not auth.get('logged_in') and not auth.get('redirect_uri') and credentials): raise ExtractorError( 'Unable to login: %s' % credentials, expected=True) # oreilly serves two same instances of the following cookies # in Set-Cookie header and expects first one to be actually set for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'): self._apply_first_set_cookie_header(urlh, cookie) _, urlh = self._download_webpage_handle( auth.get('redirect_uri') or next_uri, None, 'Completing login',) if is_logged(urlh): self.LOGGED_IN = True return raise ExtractorError('Unable to log in') class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' _VALID_URL = r'''(?x) https?:// (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ (?: library/view/[^/]+/(?P[^/]+)/(?P[^/?\#&]+)\.html| videos/[^/]+/[^/]+/(?P[^-]+-[^/?\#&]+) ) ''' _TESTS = [{ 'url': 
'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', 'md5': 'dcc5a425e79f2564148652616af1f2a3', 'info_dict': { 'id': '0_qbqx90ic', 'ext': 'mp4', 'title': 'Introduction to Hadoop Fundamentals LiveLessons', 'timestamp': 1437758058, 'upload_date': '20150724', 'uploader_id': 'stork', }, }, { # non-digits in course id 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', 'only_matching': True, }, { 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', 'only_matching': True, }, { 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00', 'only_matching': True, }, { 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro', 'only_matching': True, }, { 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html', 'only_matching': True, }] _PARTNER_ID = '1926081' _UICONF_ID = '29375172' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) reference_id = mobj.group('reference_id') if reference_id: video_id = reference_id partner_id = self._PARTNER_ID ui_id = self._UICONF_ID else: video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part')) webpage, urlh = self._download_webpage_handle(url, video_id) mobj = re.match(self._VALID_URL, urlh.geturl()) reference_id = mobj.group('reference_id') if not reference_id: reference_id = self._search_regex( r'data-reference-id=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'kaltura reference id', group='id') partner_id = self._search_regex( r'data-partner-id=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'kaltura widget id', default=self._PARTNER_ID, group='id') ui_id = self._search_regex( r'data-ui-id=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'kaltura uiconf id', default=self._UICONF_ID, group='id') query = { 'wid': 
'_%s' % partner_id, 'uiconf_id': ui_id, 'flashvars[referenceId]': reference_id, } if self.LOGGED_IN: kaltura_session = self._download_json( '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), video_id, 'Downloading kaltura session JSON', 'Unable to download kaltura session JSON', fatal=False, headers={'Accept': 'application/json'}) if kaltura_session: session = kaltura_session.get('session') if session: query['flashvars[ks]'] = session return self.url_result(update_url_query( 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), 'Kaltura') class SafariApiIE(SafariBaseIE): IE_NAME = 'safari:api' _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', 'only_matching': True, }, { 'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) part = self._download_json( url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), 'Downloading part JSON') return self.url_result(part['web_url'], SafariIE.ie_key()) class SafariCourseIE(SafariBaseIE): IE_NAME = 'safari:course' IE_DESC = 'safaribooksonline.com online courses' _VALID_URL = r'''(?x) https?:// (?: (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/ (?: library/view/[^/]+| api/v1/book| videos/[^/]+ )| techbus\.safaribooksonline\.com ) /(?P[^/]+) ''' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', 'info_dict': { 'id': '9780133392838', 'title': 'Hadoop Fundamentals LiveLessons', }, 'playlist_count': 22, 'skip': 'Requires safaribooksonline account credentials', }, { 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', 
'only_matching': True, }, { 'url': 'http://techbus.safaribooksonline.com/9780134426365', 'only_matching': True, }, { 'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314', 'only_matching': True, }, { 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838', 'only_matching': True, }, { 'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', 'only_matching': True, }] @classmethod def suitable(cls, url): return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url) else super(SafariCourseIE, cls).suitable(url)) def _real_extract(self, url): course_id = self._match_id(url) course_json = self._download_json( '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), course_id, 'Downloading course JSON') if 'chapters' not in course_json: raise ExtractorError( 'No chapters found for course %s' % course_id, expected=True) entries = [ self.url_result(chapter, SafariApiIE.ie_key()) for chapter in course_json['chapters']] course_title = course_json['title'] return self.playlist_result(entries, course_id, course_title) ================================================ FILE: youtube_dl/extractor/samplefocus.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( extract_attributes, get_element_by_attribute, int_or_none, ) class SampleFocusIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?samplefocus\.com/samples/(?P[^/?&#]+)' _TESTS = [{ 'url': 'https://samplefocus.com/samples/lil-peep-sad-emo-guitar', 'md5': '48c8d62d60be467293912e0e619a5120', 'info_dict': { 'id': '40316', 'display_id': 'lil-peep-sad-emo-guitar', 'ext': 'mp3', 'title': 'Lil Peep Sad Emo Guitar', 'thumbnail': r're:^https?://.+\.png', 'license': 'Standard License', 'uploader': 'CapsCtrl', 'uploader_id': 'capsctrl', 'like_count': int, 'comment_count': int, 
'categories': ['Samples', 'Guitar', 'Electric guitar'], }, }, { 'url': 'https://samplefocus.com/samples/dababy-style-bass-808', 'only_matching': True }, { 'url': 'https://samplefocus.com/samples/young-chop-kick', 'only_matching': True }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) sample_id = self._search_regex( r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', webpage, 'sample id', group='id') title = self._og_search_title(webpage, fatal=False) or self._html_search_regex( r'

(.+?)

', webpage, 'title') mp3_url = self._search_regex( r']+id=(["\'])sample_mp3\1[^>]+value=(["\'])(?P(?:(?!\2).)+)', webpage, 'mp3', fatal=False, group='url') or extract_attributes(self._search_regex( r']+itemprop=(["\'])contentUrl\1[^>]*>', webpage, 'mp3 url', group=0))['content'] thumbnail = self._og_search_thumbnail(webpage) or self._html_search_regex( r']+class=(?:["\'])waveform responsive-img[^>]+src=(["\'])(?P(?:(?!\1).)+)', webpage, 'mp3', fatal=False, group='url') comments = [] for author_id, author, body in re.findall(r'(?s)]+class="comment-author">]+href="/users/([^"]+)">([^"]+).+?]+class="comment-body">([^>]+)

', webpage): comments.append({ 'author': author, 'author_id': author_id, 'text': body, }) uploader_id = uploader = None mobj = re.search(r'>By ]+href="/users/([^"]+)"[^>]*>([^<]+)', webpage) if mobj: uploader_id, uploader = mobj.groups() breadcrumb = get_element_by_attribute('typeof', 'BreadcrumbList', webpage) categories = [] if breadcrumb: for _, name in re.findall(r']+property=(["\'])name\1[^>]*>([^<]+)', breadcrumb): categories.append(name) def extract_count(klass): return int_or_none(self._html_search_regex( r']+class=(?:["\'])?%s-count[^>]*>(\d+)' % klass, webpage, klass, fatal=False)) return { 'id': sample_id, 'title': title, 'url': mp3_url, 'display_id': display_id, 'thumbnail': thumbnail, 'uploader': uploader, 'license': self._html_search_regex( r']+href=(["\'])/license\1[^>]*>(?P[^<]+)<', webpage, 'license', fatal=False, group='license'), 'uploader_id': uploader_id, 'like_count': extract_count('sample-%s-favorites' % sample_id), 'comment_count': extract_count('comments'), 'comments': comments, 'categories': categories, } ================================================ FILE: youtube_dl/extractor/sapo.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( parse_duration, unified_strdate, ) class SapoIE(InfoExtractor): IE_DESC = 'SAPO Vídeos' _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P[\da-zA-Z]{20})' _TESTS = [ { 'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', 'md5': '79ee523f6ecb9233ac25075dee0eda83', 'note': 'SD video', 'info_dict': { 'id': 'UBz95kOtiWYUMTA5Ghfi', 'ext': 'mp4', 'title': 'Benfica - Marcas na Hitória', 'description': 'md5:c9082000a128c3fd57bf0299e1367f22', 'duration': 264, 'uploader': 'tiago_1988', 'upload_date': '20080229', 'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'], }, }, { 'url': 
'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', 'md5': '90a2f283cfb49193fe06e861613a72aa', 'note': 'HD video', 'info_dict': { 'id': 'IyusNAZ791ZdoCY5H5IF', 'ext': 'mp4', 'title': 'Codebits VII - Report', 'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', 'duration': 144, 'uploader': 'codebits', 'upload_date': '20140427', 'categories': ['codebits', 'codebits2014'], }, }, { 'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', 'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', 'note': 'v2 video', 'info_dict': { 'id': 'yLqjzPtbTimsn2wWBKHz', 'ext': 'mp4', 'title': 'Hipnose Condicionativa 4', 'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', 'duration': 692, 'uploader': 'sapozen', 'upload_date': '20090609', 'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], }, }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') item = self._download_xml( 'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') title = item.find('./title').text description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text upload_date = unified_strdate(item.find('./pubDate').text) view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text categories = tags.split() if tags else [] age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') formats = [{ 'url': video_url, 'ext': 'mp4', 'format_id': 'sd', 'width': int(video_size[0]), 
'height': int(video_size[1]), }] if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': formats.append({ 'url': re.sub(r'/mov/1$', '/mov/39', video_url), 'ext': 'mp4', 'format_id': 'hd', 'width': 1280, 'height': 720, }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'uploader': uploader, 'upload_date': upload_date, 'view_count': view_count, 'comment_count': comment_count, 'categories': categories, 'age_limit': age_limit, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/savefrom.py ================================================ # coding: utf-8 from __future__ import unicode_literals import os.path import re from .common import InfoExtractor class SaveFromIE(InfoExtractor): IE_NAME = 'savefrom.net' _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P.*)$' _TEST = { 'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com', 'info_dict': { 'id': 'UlVRAPW2WJY', 'ext': 'mp4', 'title': 'About Team Radical MMA | MMA Fighting', 'upload_date': '20120816', 'uploader': 'Howcast', 'uploader_id': 'Howcast', 'description': r're:(?s).* Hi, my name is Rene Dreifuss\. 
And I\'m here to show you some MMA.*', }, 'params': { 'skip_download': True } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = os.path.splitext(url.split('/')[-1])[0] return self.url_result(mobj.group('url'), video_id=video_id) ================================================ FILE: youtube_dl/extractor/sbs.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( smuggle_url, ExtractorError, ) class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', 'md5': '3150cf278965eeabb5b4cea1c963fe0a', 'info_dict': { 'id': '_rFBPRPO4pMR', 'ext': 'mp4', 'title': 'Dingo Conservation (The Feed)', 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', 'thumbnail': r're:http://.*\.jpg', 'duration': 308, 'timestamp': 1408613220, 'upload_date': '20140821', 'uploader': 'SBSC', }, }, { 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', 'only_matching': True, }, { 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9', 'only_matching': True, }, { 'url': 'https://www.sbs.com.au/ondemand/?play=1836638787723', 'only_matching': True, }, { 'url': 'https://www.sbs.com.au/ondemand/program/inside-windsor-castle?play=1283505731842', 'only_matching': True, }, { 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866', 'only_matching': True, }, { 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) player_params = self._download_json( 
'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) error = player_params.get('error') if error: error_message = 'Sorry, The video you are looking for does not exist.' video_data = error.get('results') or {} error_code = error.get('errorCode') if error_code == 'ComingSoon': error_message = '%s is not yet available.' % video_data.get('title', '') elif error_code in ('Forbidden', 'intranetAccessOnly'): error_message = 'Sorry, This video cannot be accessed via this website' elif error_code == 'Expired': error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '') raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) urls = player_params['releaseUrls'] theplatform_url = (urls.get('progressive') or urls.get('html') or urls.get('standard') or player_params['relatedItemsURL']) return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', 'id': video_id, 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), } ================================================ FILE: youtube_dl/extractor/screencast.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urllib_request, ) from ..utils import ( ExtractorError, ) class ScreencastIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://www.screencast.com/t/3ZEjQXlT', 'md5': '917df1c13798a3e96211dd1561fded83', 'info_dict': { 'id': '3ZEjQXlT', 'ext': 'm4v', 'title': 'Color Measurement with Ocean Optics Spectrometers', 'description': 'md5:240369cde69d8bed61349a199c5fb153', 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', } }, { 'url': 'http://www.screencast.com/t/V2uXehPJa1ZI', 'md5': 'e8e4b375a7660a9e7e35c33973410d34', 'info_dict': { 'id': 'V2uXehPJa1ZI', 'ext': 'mov', 'title': 'The Amadeus Spectrometer', 'description': 're:^In this 
video, our friends at.*To learn more about Amadeus, visit', 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', } }, { 'url': 'http://www.screencast.com/t/aAB3iowa', 'md5': 'dedb2734ed00c9755761ccaee88527cd', 'info_dict': { 'id': 'aAB3iowa', 'ext': 'mp4', 'title': 'Google Earth Export', 'description': 'Provides a demo of a CommunityViz export to Google Earth, one of the 3D viewing options.', 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', } }, { 'url': 'http://www.screencast.com/t/X3ddTrYh', 'md5': '669ee55ff9c51988b4ebc0877cc8b159', 'info_dict': { 'id': 'X3ddTrYh', 'ext': 'wmv', 'title': 'Toolkit 6 User Group Webinar (2014-03-04) - Default Judgment and First Impression', 'description': 'md5:7b9f393bc92af02326a5c5889639eab0', 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', } }, { 'url': 'http://screencast.com/t/aAB3iowa', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._html_search_regex( r'(?:(?!\1).)+)\1', webpage, 'video url', default=None, group='url') if video_url is None: video_url = self._html_search_meta( 'og:video', webpage, default=None) if video_url is None: raise ExtractorError('Cannot find video') title = self._og_search_title(webpage, default=None) if title is None: title = self._html_search_regex( [r'Title: ([^<]+)
', r'class="tabSeperator">>(.+?)<', r'([^<]+)'], webpage, 'title') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) if description is None: description = self._html_search_meta('description', webpage) return { 'id': video_id, 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, } ================================================ FILE: youtube_dl/extractor/screencastomatic.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( get_element_by_class, int_or_none, remove_start, strip_or_none, unified_strdate, ) class ScreencastOMaticIE(InfoExtractor): _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', 'md5': '483583cb80d92588f15ccbedd90f0c18', 'info_dict': { 'id': 'c2lD3BeOPl', 'ext': 'mp4', 'title': 'Welcome to 3-4 Philosophy @ DECV!', 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'as the title says! 
also: some general info re 1) VCE philosophy and 2) distance learning.', 'duration': 369, 'upload_date': '20141216', } }, { 'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl', 'only_matching': True, }, { 'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'https://screencast-o-matic.com/player/' + video_id, video_id) info = self._parse_html5_media_entries(url, webpage, video_id)[0] info.update({ 'id': video_id, 'title': get_element_by_class('overlayTitle', webpage), 'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None, 'duration': int_or_none(self._search_regex( r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};', webpage, 'duration', default=None)), 'upload_date': unified_strdate(remove_start( get_element_by_class('overlayPublished', webpage), 'Published: ')), }) return info ================================================ FILE: youtube_dl/extractor/scrippsnetworks.py ================================================ # coding: utf-8 from __future__ import unicode_literals import json import hashlib import re from .aws import AWSIE from .anvato import AnvatoIE from .common import InfoExtractor from ..utils import ( smuggle_url, urlencode_postdata, xpath_text, ) class ScrippsNetworksWatchIE(AWSIE): IE_NAME = 'scrippsnetworks:watch' _VALID_URL = r'''(?x) https?:// watch\. 
(?Pgeniuskitchen)\.com/ (?: player\.[A-Z0-9]+\.html\#| show/(?:[^/]+/){2}| player/ ) (?P\d+) ''' _TESTS = [{ 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/', 'info_dict': { 'id': '4194875', 'ext': 'mp4', 'title': 'Ample Hills Ice Cream Bike', 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.', 'uploader': 'ANV', 'upload_date': '20171011', 'timestamp': 1507698000, }, 'params': { 'skip_download': True, }, 'add_ie': [AnvatoIE.ie_key()], }] _SNI_TABLE = { 'geniuskitchen': 'genius', } _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1' _AWS_PROXY_HOST = 'web.api.video.snidigital.com' _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site', 'id') aws_identity_id_json = json.dumps({ 'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION }).encode('utf-8') token = self._download_json( 'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id, data=aws_identity_id_json, headers={ 'Accept': '*/*', 'Content-Type': 'application/x-amz-json-1.1', 'Referer': url, 'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(), 'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken', 'X-Amz-User-Agent': self._AWS_USER_AGENT, })['Token'] sts = self._download_xml( 'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({ 'Action': 'AssumeRoleWithWebIdentity', 'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role', 'RoleSessionName': 'web-identity', 'Version': '2011-06-15', 'WebIdentityToken': token, }), headers={ 'Referer': url, 'X-Amz-User-Agent': self._AWS_USER_AGENT, 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', }) def get(key): return xpath_text( sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key, fatal=True) mcp_id = self._aws_execute_api({ 'uri': '/1/web/brands/%s/episodes/scrid/%s' % 
(self._SNI_TABLE[site_id], video_id), 'access_key': get('AccessKeyId'), 'secret_key': get('SecretAccessKey'), 'session_token': get('SessionToken'), }, video_id)['results'][0]['mcpId'] return self.url_result( smuggle_url( 'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id, {'geo_countries': ['US']}), AnvatoIE.ie_key(), video_id=mcp_id) class ScrippsNetworksIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?Pcookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P\d+)' _TESTS = [{ 'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338', 'info_dict': { 'id': '0260338', 'ext': 'mp4', 'title': 'The Best of the Best', 'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.', 'timestamp': 1475678834, 'upload_date': '20161005', 'uploader': 'SCNI-SCND', }, 'add_ie': ['ThePlatform'], }, { 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', 'only_matching': True, }, { 'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591', 'only_matching': True, }, { 'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929', 'only_matching': True, }, { 'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184', 'only_matching': True, }, { 'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368', 'only_matching': True, }] _ACCOUNT_MAP = { 'cookingchanneltv': 2433005105, 'discovery': 2706091867, 'diynetwork': 2433004575, 'foodnetwork': 2433005105, 'hgtv': 2433004575, 'travelchannel': 2433005739, } _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true' def _real_extract(self, url): site, guid = re.match(self._VALID_URL, url).groups() return self.url_result(smuggle_url( self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid), {'force_smil_url': True}), 'ThePlatform', guid) ================================================ FILE: 
class SCTEBaseIE(InfoExtractor):
    """Shared login logic for the SCTE learning portal extractors."""
    _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
    _NETRC_MACHINE = 'scte'

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Log in with .netrc/--username credentials; no-op when absent."""
        username, password = self._get_login_info()
        if username is None:
            return

        login_popup = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login popup')

        def is_logged(webpage):
            return any(re.search(p, webpage) for p in (
                r'class=["\']welcome\b', r'>Sign Out<'))

        # already logged in
        if is_logged(login_popup):
            return

        login_form = self._hidden_inputs(login_popup)

        login_form.update({
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
        })

        response = self._download_webpage(
            self._LOGIN_URL, None, 'Logging in',
            data=urlencode_postdata(login_form))

        # An ASP.NET partial-postback redirect marker also indicates success.
        if '|pageRedirect|' not in response and not is_logged(response):
            # NOTE(review): the `</div>` terminator, the raise statements and
            # the SCTEIE class header were destroyed by tag-stripping in the
            # dump; reconstructed from the upstream source.
            error = self._html_search_regex(
                r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</div>',
                response, 'error message', default=None)
            if error:
                raise ExtractorError(
                    'Unable to login: %s' % error, expected=True)
            raise ExtractorError('Unable to log in')


class SCTEIE(SCTEBaseIE):
    """Extracts the MP4 assets of a single SCORM module as a playlist."""
    _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
        'info_dict': {
            'title': 'Introduction to DOCSIS Engineering Professional',
            'id': '31484',
        },
        'playlist_count': 5,
        'skip': 'Requires account credentials',
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # assumes the page title lives in a plain <h1> — the original tag
        # text was stripped from the dump; TODO confirm against a live page
        title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')

        context_id = self._search_regex(r'context-(\d+)', webpage, video_id)
        content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
        # data.js ships packed (p,a,c,k,e,d) JavaScript containing the manifest
        context = decode_packed_codes(self._download_webpage(
            '%smobile/data.js' % content_base, video_id))

        data = self._parse_xml(
            self._search_regex(
                r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
            video_id)

        entries = []
        for asset in data.findall('.//asset'):
            asset_url = asset.get('url')
            if not asset_url or not asset_url.endswith('.mp4'):
                continue
            asset_id = self._search_regex(
                r'video_([^_]+)_', asset_url, 'asset id', default=None)
            if not asset_id:
                continue
            entries.append({
                'id': asset_id,
                'title': title,
                'url': content_base + asset_url,
            })

        return self.playlist_result(entries, video_id, title)


class SCTECourseIE(SCTEBaseIE):
    """Expands a course (or subcourse) page into its SCORM-module entries."""
    _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
        'only_matching': True,
    }, {
        'url': 'https://learning.scte.org/course/view.php?id=3639',
        'only_matching': True,
    }, {
        'url': 'https://learning.scte.org/course/view.php?id=3073',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        course_id = self._match_id(url)

        webpage = self._download_webpage(url, course_id)

        title = self._search_regex(
            r'<h1>(.+?)</h1>', webpage, 'title', default=None)

        entries = []
        for mobj in re.finditer(
                r'''(?x)
                    <a[^>]+
                        href=(["\'])
                        (?P<url>
                            https?://learning\.scte\.org/mod/
                            (?P<kind>scorm|subcourse)/view\.php?(?:(?!\1).)*?
                            \bid=\d+
                        )
                    ''',
                webpage):
            item_url = mobj.group('url')
            # skip self-references so a subcourse page does not recurse
            if item_url == url:
                continue
            ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
                  else SCTECourseIE.ie_key())
            entries.append(self.url_result(item_url, ie=ie))

        return self.playlist_result(entries, course_id, title)
class SeekerIE(InfoExtractor):
    """Extracts the JW Platform embeds from a seeker.com article as a playlist."""
    # Restored the stripped `display_id`/`article_id` named groups —
    # _real_extract unpacks exactly these two groups via .groups().
    _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html'
    _TESTS = [{
        'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html',
        'md5': '897d44bbe0d8986a2ead96de565a92db',
        'info_dict': {
            'id': 'Elrn3gnY',
            'ext': 'mp4',
            'title': 'Should Trump Be Required To Release His Tax Returns?',
            'description': 'md5:41efa8cfa8d627841045eec7b018eb45',
            'timestamp': 1490090165,
            'upload_date': '20170321',
        }
    }, {
        'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html',
        'playlist': [
            {
                'md5': '0497b9f20495174be73ae136949707d2',
                'info_dict': {
                    'id': 'FihYQ8AE',
                    'ext': 'mp4',
                    'title': 'The Pros & Cons Of Zoos',
                    'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c',
                    'timestamp': 1490039133,
                    'upload_date': '20170320',
                },
            }
        ],
        'info_dict': {
            'id': '1834116536',
            'title': 'After Gorilla Killing, Changes Ahead for Zoos',
            'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.',
        },
    }]

    def _real_extract(self, url):
        display_id, article_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, display_id)
        entries = []
        # each embedded player carries its 8-char JW Platform media id
        for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage):
            entries.append(self.url_result(
                'jwplatform:' + jwp_id, 'JWPlatform', jwp_id))
        return self.playlist_result(
            entries, article_id,
            self._og_search_title(webpage),
            strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage))
class SenateISVPIE(InfoExtractor):
    """Extractor for the US Senate Integrated Senate Video Player (ISVP)."""
    # committee slug -> (Akamai stream number, streaming domain)
    _COMM_MAP = [
        ['ag', '76440', 'http://ag-f.akamaihd.net'],
        ['aging', '76442', 'http://aging-f.akamaihd.net'],
        ['approps', '76441', 'http://approps-f.akamaihd.net'],
        ['armed', '76445', 'http://armed-f.akamaihd.net'],
        ['banking', '76446', 'http://banking-f.akamaihd.net'],
        ['budget', '76447', 'http://budget-f.akamaihd.net'],
        ['cecc', '76486', 'http://srs-f.akamaihd.net'],
        ['commerce', '80177', 'http://commerce1-f.akamaihd.net'],
        ['csce', '75229', 'http://srs-f.akamaihd.net'],
        ['dpc', '76590', 'http://dpc-f.akamaihd.net'],
        ['energy', '76448', 'http://energy-f.akamaihd.net'],
        ['epw', '76478', 'http://epw-f.akamaihd.net'],
        ['ethics', '76449', 'http://ethics-f.akamaihd.net'],
        ['finance', '76450', 'http://finance-f.akamaihd.net'],
        ['foreign', '76451', 'http://foreign-f.akamaihd.net'],
        ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'],
        ['help', '76452', 'http://help-f.akamaihd.net'],
        ['indian', '76455', 'http://indian-f.akamaihd.net'],
        ['intel', '76456', 'http://intel-f.akamaihd.net'],
        ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'],
        ['jccic', '85180', 'http://jccic-f.akamaihd.net'],
        ['jec', '76458', 'http://jec-f.akamaihd.net'],
        ['judiciary', '76459', 'http://judiciary-f.akamaihd.net'],
        ['rpc', '76591', 'http://rpc-f.akamaihd.net'],
        ['rules', '76460', 'http://rules-f.akamaihd.net'],
        ['saa', '76489', 'http://srs-f.akamaihd.net'],
        ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'],
        ['srs', '75229', 'http://srs-f.akamaihd.net'],
        ['uscc', '76487', 'http://srs-f.akamaihd.net'],
        ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],
        ['arch', '', 'http://ussenate-f.akamaihd.net/']
    ]
    IE_NAME = 'senate.gov'
    # Restored the stripped `qs` named group — _real_extract parses the
    # whole query string out of it.
    _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
    _TESTS = [{
        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
        'info_dict': {
            'id': 'judiciary031715',
            'ext': 'mp4',
            'title': 'Integrated Senate Video Player',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
        'info_dict': {
            'id': 'commerce011514',
            'ext': 'mp4',
            'title': 'Integrated Senate Video Player'
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
        # checksum differs each time
        'info_dict': {
            'id': 'intel090613',
            'ext': 'mp4',
            'title': 'Integrated Senate Video Player'
        }
    }, {
        # From http://www.c-span.org/video/?96791-1
        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
        'only_matching': True,
    }]

    @staticmethod
    def _search_iframe_url(webpage):
        """Return the first embedded ISVP iframe URL in webpage, or None."""
        mobj = re.search(
            r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
            webpage)
        if mobj:
            return mobj.group('url')

    def _get_info_for_comm(self, committee):
        """Return [stream_number, domain] for a committee slug."""
        for entry in self._COMM_MAP:
            if entry[0] == committee:
                return entry[1:]

    def _real_extract(self, url):
        url, smuggled_data = unsmuggle_url(url, {})

        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
            raise ExtractorError('Invalid URL', expected=True)

        # Fixed: the dot was unescaped, which would also strip e.g. 'xmp4'.
        video_id = re.sub(r'\.mp4$', '', qs['filename'][0])

        webpage = self._download_webpage(url, video_id)

        # C-SPAN smuggles a better title than the player's generic one
        if smuggled_data.get('force_title'):
            title = smuggled_data['force_title']
        else:
            title = self._html_search_regex(
                r'<title>([^<]+)</title>', webpage, video_id)
        poster = qs.get('poster')
        thumbnail = poster[0] if poster else None

        video_type = qs['type'][0]
        committee = video_type if video_type == 'arch' else qs['comm'][0]
        stream_num, domain = self._get_info_for_comm(committee)

        formats = []
        if video_type == 'arch':
            filename = video_id if '.' in video_id else video_id + '.mp4'
            formats = [{
                # All parameters in the query string are necessary to prevent a 403 error
                'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
            }]
        else:
            hdcore_sign = 'hdcore=3.1.0'
            url_params = (domain, video_id, stream_num)
            f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign
            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
                # URLs without the extra param induce an 404 error
                entry.update({'extra_param_to_segment_url': hdcore_sign})
                formats.append(entry)
            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
                mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url'])
                if mobj:
                    entry['format_id'] += mobj.group('tag')
                formats.append(entry)

            self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
        }
class SendtoNewsIE(InfoExtractor):
    """Extracts all videos of an embed.sendtonews.com playlist."""
    _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)'

    _TEST = {
        # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/
        'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES',
        'info_dict': {
            'id': 'GxfCe0Zo7D-175909-5588'
        },
        'playlist_count': 8,
        # test the first video only to prevent lengthy tests
        'playlist': [{
            'info_dict': {
                'id': '240385',
                'ext': 'mp4',
                'title': 'Indians introduce Encarnacion',
                'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland',
                'duration': 137.898,
                'thumbnail': r're:https?://.*\.jpg$',
                'upload_date': '20170105',
                'timestamp': 1483649762,
            },
        }],
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s'

    @classmethod
    def _extract_url(cls, webpage):
        """Return the canonical embed URL for the first player found, or None."""
        mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
            (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
                .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
            \1>''', webpage)
        if mobj:
            sc = mobj.group('SC')
            return cls._URL_TEMPLATE % sc

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        # The data endpoint mirrors the player URL with a different script name.
        data_url = update_url_query(
            url.replace('embedplayer.php', 'data_read.php'),
            {'cmd': 'loadInitial'})
        playlist_data = self._download_json(data_url, playlist_id)

        entries = []
        for video in playlist_data['playlistData'][0]:
            info_dict = self._parse_jwplayer_data(
                video['jwconfiguration'],
                require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True})

            for f in info_dict['formats']:
                if f.get('tbr'):
                    continue
                # derive the bitrate from the '/NNNk/' path component
                tbr = int_or_none(self._search_regex(
                    r'/(\d+)k/', f['url'], 'bitrate', default=None))
                if not tbr:
                    continue
                f.update({
                    'format_id': '%s-%d' % (determine_protocol(f), tbr),
                    'tbr': tbr,
                })
            self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id'))

            thumbnails = []
            if video.get('thumbnailUrl'):
                thumbnails.append({
                    'id': 'normal',
                    'url': video['thumbnailUrl'],
                })
            if video.get('smThumbnailUrl'):
                thumbnails.append({
                    'id': 'small',
                    'url': video['smThumbnailUrl'],
                })
            info_dict.update({
                'title': video['S_headLine'].strip(),
                'description': unescapeHTML(video.get('S_fullStory')),
                'thumbnails': thumbnails,
                'duration': float_or_none(video.get('SM_length')),
                'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '),
            })
            entries.append(info_dict)

        return self.playlist_result(entries, playlist_id)
class ServusIE(InfoExtractor):
    """Extractor for servus.com / servustv.com / pm-wissen.com videos.

    Obtains an OAuth client-credentials token from Red Bull Media House,
    then reads the asset's resources and attributes from the Sparkle API.
    """
    # Restored the stripped `id` named group (old ids like AA-xxxx, new
    # numeric pairs like 1309984137314-381415152).
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?
                        (?:
                            servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
                            (?:servustv|pm-wissen)\.com/videos
                        )
                        /(?P<id>[aA]{2}-\w+|\d+-\d+)
                    '''
    _TESTS = [{
        # new URL schema
        'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
        'md5': '60474d4c21f3eb148838f215c37f02b9',
        'info_dict': {
            'id': 'AA-1T6VBU5PW1W12',
            'ext': 'mp4',
            'title': 'Die Grünen aus Sicht des Volkes',
            'alt_title': 'Talk im Hangar-7 Voxpops Gruene',
            'description': 'md5:1247204d85783afe3682644398ff2ec4',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 62.442,
            'timestamp': 1605193976,
            'upload_date': '20201112',
            'series': 'Talk im Hangar-7',
            'season': 'Season 9',
            'season_number': 9,
            'episode': 'Episode 31 - September 14',
            'episode_number': 31,
        }
    }, {
        # old URL schema
        'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
        'only_matching': True,
    }, {
        'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/',
        'only_matching': True,
    }, {
        'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/',
        'only_matching': True,
    }, {
        'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
        'only_matching': True,
    }, {
        'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # the API canonicalizes ids to upper case
        video_id = self._match_id(url).upper()

        token = self._download_json(
            'https://auth.redbullmediahouse.com/token', video_id,
            'Downloading token', data=urlencode_postdata({
                'grant_type': 'client_credentials',
            }), headers={
                'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==',
            })
        access_token = token['access_token']
        token_type = token.get('token_type', 'Bearer')

        video = self._download_json(
            'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id,
            video_id, 'Downloading video JSON', headers={
                'Authorization': '%s %s' % (token_type, access_token),
            })

        formats = []
        thumbnail = None
        for resource in video['resources']:
            if not isinstance(resource, dict):
                continue
            format_url = url_or_none(resource.get('url'))
            if not format_url:
                continue
            extension = resource.get('extension')
            type_ = resource.get('type')
            if extension == 'jpg' or type_ == 'reference_keyframe':
                thumbnail = format_url
                continue
            ext = determine_ext(format_url)
            if type_ == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    format_url, video_id, mpd_id='dash', fatal=False))
            elif type_ == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
            elif extension == 'mp4' or ext == 'mp4':
                formats.append({
                    'url': format_url,
                    'format_id': type_,
                    'width': int_or_none(resource.get('width')),
                    'height': int_or_none(resource.get('height')),
                })
        self._sort_formats(formats)

        # flatten the fieldKey/fieldValue attribute list into a dict
        attrs = {}
        for attribute in video['attributes']:
            if not isinstance(attribute, dict):
                continue
            key = attribute.get('fieldKey')
            value = attribute.get('fieldValue')
            if not key or not value:
                continue
            attrs[key] = value

        title = attrs.get('title_stv') or video_id
        alt_title = attrs.get('title')
        description = attrs.get('long_description') or attrs.get('short_description')
        series = attrs.get('label')
        season = attrs.get('season')
        episode = attrs.get('chapter')
        duration = float_or_none(attrs.get('duration'), scale=1000)
        season_number = int_or_none(self._search_regex(
            r'Season (\d+)', season or '', 'season number', default=None))
        episode_number = int_or_none(self._search_regex(
            r'Episode (\d+)', episode or '', 'episode number', default=None))

        return {
            'id': video_id,
            'title': title,
            'alt_title': alt_title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'timestamp': unified_timestamp(video.get('lastPublished')),
            'series': series,
            'season': season,
            'season_number': season_number,
            'episode': episode,
            'episode_number': episode_number,
            'formats': formats,
        }
class SevenPlusIE(BrightcoveNewIE):
    """Extractor for 7plus.com.au (Australian Seven Network catch-up).

    Fetches Brightcove playback metadata from the SWM video service and
    enriches it with series/episode info from the content API.
    """
    IE_NAME = '7plus'
    # Restored the stripped `path`/`id` named groups — _real_extract unpacks
    # (path, episode_id) from .groups().
    _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))'
    _TESTS = [{
        'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003',
        'info_dict': {
            'id': 'MTYS7-003',
            'ext': 'mp4',
            'title': 'S7 E3 - Wind Surf',
            'description': 'md5:29c6a69f21accda7601278f81b46483d',
            'uploader_id': '5303576322001',
            'upload_date': '20171201',
            'timestamp': 1512106377,
            'series': 'Mighty Ships',
            'season_number': 7,
            'episode_number': 3,
            'episode': 'Wind Surf',
        },
        'params': {
            'format': 'bestvideo',
            'skip_download': True,
        }
    }, {
        'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        path, episode_id = re.match(self._VALID_URL, url).groups()

        try:
            media = self._download_json(
                'https://videoservice.swm.digital/playback', episode_id, query={
                    'appId': '7plus',
                    'deviceType': 'web',
                    'platformType': 'web',
                    'accountId': 5303576322001,
                    'referenceId': 'ref:' + episode_id,
                    'deliveryId': 'csai',
                    'videoType': 'vod',
                })['media']
        except ExtractorError as e:
            # 403 carries a JSON body with a human-readable error code
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                raise ExtractorError(self._parse_json(
                    e.cause.read().decode(), episode_id)[0]['error_code'], expected=True)
            raise

        for source in media.get('sources', {}):
            src = source.get('src')
            if not src:
                continue
            # an empty 'rule' strips server-side ad insertion
            source['src'] = update_url_query(src, {'rule': ''})

        info = self._parse_brightcove_metadata(media, episode_id)

        content = self._download_json(
            'https://component-cdn.swm.digital/content/' + path,
            episode_id, headers={
                'market-id': 4,
            }, fatal=False) or {}
        for item in content.get('items', {}):
            if item.get('componentData', {}).get('componentType') == 'infoPanel':
                for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]:
                    value = item.get(src_key)
                    if value:
                        info[dst_key] = value
                info['series'] = try_get(
                    item, lambda x: x['seriesLogo']['name'], compat_str)
                # titles look like 'S7 E3 - Wind Surf'
                mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title'])
                if mobj:
                    info.update({
                        'season_number': int(mobj.group(1)),
                        'episode_number': int(mobj.group(2)),
                        'episode': mobj.group(3),
                    })

        return info
class SexuIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?sexu\.com/(?P<id>\d+)'
    _TEST = {
        'url': 'http://sexu.com/961791/',
        'md5': 'ff615aca9691053c94f8f10d96cd7884',
        'info_dict': {
            'id': '961791',
            'ext': 'mp4',
            'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
            'description': 'md5:2b75327061310a3afb3fbd7d09e2e403',
            'categories': list,  # NSFW
            'thumbnail': r're:https?://.*\.jpg$',
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # the page configures a JW Player inline; its setup object carries
        # the source list and poster image
        jwvideo = self._parse_json(
            self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'),
            video_id)

        sources = jwvideo['sources']

        formats = [{
            'url': source['file'].replace('\\', ''),
            'format_id': source.get('label'),
            'height': int(self._search_regex(
                r'^(\d+)[pP]', source.get('label', ''), 'height',
                default=None)),
        } for source in sources if source.get('file')]
        self._sort_formats(formats)

        title = self._html_search_regex(
            r'<title>([^<]+)\s*-\s*Sexu\.Com</title>', webpage, 'title')

        description = self._html_search_meta(
            'description', webpage, 'description')

        thumbnail = jwvideo.get('image')

        categories_str = self._html_search_meta(
            'keywords', webpage, 'categories')
        categories = (
            None if categories_str is None
            else categories_str.split(','))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'categories': categories,
            'formats': formats,
            'age_limit': 18,
        }
def _raw_id(src_url):
    """Return the last path component of an SDN source URL (the raw media id)."""
    return compat_urllib_parse_urlparse(src_url).path.split('/')[-1]


class SeznamZpravyIE(InfoExtractor):
    """Extractor for the seznamzpravy.cz iframe player."""
    _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc='
    _TESTS = [{
        'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy',
        'info_dict': {
            'id': '170889',
            'ext': 'mp4',
            'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)',
            'thumbnail': r're:^https?://.*\.jpe?g',
            'duration': 241,
            'series': 'Svět bez obalu',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # with Location key
        'url': 'https://www.seznamzpravy.cz/iframe/player?duration=null&serviceSlug=zpravy&src=https%3A%2F%2Flive-a.sdn.szn.cz%2Fv_39%2F59e468fe454f8472a96af9fa%3Ffl%3Dmdk%2C5c1e2840%7C&itemType=livevod&autoPlay=false&title=P%C5%99edseda%20KDU-%C4%8CSL%20Pavel%20B%C4%9Blobr%C3%A1dek%20ve%20volebn%C3%AD%20V%C3%BDzv%C4%9B%20Seznamu&series=V%C3%BDzva&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_G_J%2FjTBCs.jpeg%3Ffl%3Dcro%2C0%2C0%2C1280%2C720%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=16&height=9&cutFrom=0&cutTo=0&splVersion=VOD&contentId=185688&contextId=38489&showAdvert=true&collocation=&hideFullScreen=false&hideSubtitles=false&embed=&isVideoTooShortForPreroll=false&isVideoTooShortForPreroll2=false&isVideoTooLongForPostroll=false&fakePostrollZoneID=seznam.clanky.zpravy.preroll&fakePrerollZoneID=seznam.clanky.zpravy.preroll&videoCommentId=&trim=default_16x9&noPrerollVideoLength=30&noPreroll2VideoLength=undefined&noMidrollVideoLength=0&noPostrollVideoLength=999999&autoplayPossible=true&version=5.0.41&dotService=zpravy&gemiusPrismIdentifier=zD3g7byfW5ekpXmxTVLaq5Srjw5i4hsYo0HY1aBwIe..27&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy%2Fvyzva&zoneIdPostroll=seznam.pack.videospot&skipOffsetPostroll=5&sectionPrefixPostroll=%2Fzpravy%2Fvyzva&regression=false',
        'info_dict': {
            'id': '185688',
            'ext': 'mp4',
            'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu',
            'thumbnail': r're:^https?://.*\.jpe?g',
            'series': 'Výzva',
        },
        'params': {
            'skip_download': True,
        },
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return all embedded player iframe URLs found in webpage."""
        # Restored the stripped `<iframe\b…` literal and `url` named group.
        return [
            mobj.group('url') for mobj in re.finditer(
                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1',
                webpage)]

    def _extract_sdn_formats(self, sdn_url, video_id):
        sdn_data = self._download_json(sdn_url, video_id)

        # live streams indirect through a Location key
        if sdn_data.get('Location'):
            sdn_url = sdn_data['Location']
            sdn_data = self._download_json(sdn_url, video_id)

        formats = []
        mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {}

        for format_id, format_data in mp4_formats.items():
            relative_url = format_data.get('url')
            if not relative_url:
                continue

            try:
                width, height = format_data.get('resolution')
            except (TypeError, ValueError):
                width, height = None, None

            f = {
                'url': urljoin(sdn_url, relative_url),
                'format_id': 'http-%s' % format_id,
                'tbr': int_or_none(format_data.get('bandwidth'), scale=1000),
                'width': int_or_none(width),
                'height': int_or_none(height),
            }
            f.update(parse_codecs(format_data.get('codec')))
            formats.append(f)

        pls = sdn_data.get('pls', {})

        def get_url(format_id):
            return try_get(pls, lambda x: x[format_id]['url'], compat_str)

        dash_rel_url = get_url('dash')
        if dash_rel_url:
            formats.extend(self._extract_mpd_formats(
                urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash',
                fatal=False))

        hls_rel_url = get_url('hls')
        if hls_rel_url:
            formats.extend(self._extract_m3u8_formats(
                urljoin(sdn_url, hls_rel_url), video_id, ext='mp4',
                m3u8_id='hls', fatal=False))

        self._sort_formats(formats)
        return formats

    def _real_extract(self, url):
        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)

        src = params['src'][0]
        title = params['title'][0]
        video_id = params.get('contentId', [_raw_id(src)])[0]
        formats = self._extract_sdn_formats(src + 'spl2,2,VOD', video_id)

        duration = int_or_none(params.get('duration', [None])[0])
        series = params.get('series', [None])[0]
        thumbnail = params.get('poster', [None])[0]

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'series': series,
            'formats': formats,
        }


class SeznamZpravyArticleIE(InfoExtractor):
    """Expands a seznamzpravy.cz article into its embedded player entries."""
    _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[^/?#&]+)-(?P<id>\d+)'
    _API_URL = 'https://apizpravy.seznam.cz/'

    _TESTS = [{
        # two videos on one page, with SDN URL
        'url': 'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990',
        'info_dict': {
            'id': '35990',
            'title': 'md5:6011c877a36905f28f271fcd8dcdb0f2',
            'description': 'md5:933f7b06fa337a814ba199d3596d27ba',
        },
        'playlist_count': 2,
    }, {
        # video with live stream URL
        'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489',
        'info_dict': {
            'id': '38489',
            'title': 'md5:8fa1afdc36fd378cf0eba2b74c5aca60',
            'description': 'md5:428e7926a1a81986ec7eb23078004fb4',
        },
        'playlist_count': 1,
    }]

    def _real_extract(self, url):
        article_id = self._match_id(url)

        webpage = self._download_webpage(url, article_id)

        info = self._search_json_ld(webpage, article_id, default={})

        title = info.get('title') or self._og_search_title(webpage, fatal=False)
        description = info.get('description') or self._og_search_description(webpage)

        return self.playlist_result([
            self.url_result(entry_url, ie=SeznamZpravyIE.ie_key())
            for entry_url in SeznamZpravyIE._extract_urls(webpage)],
            article_id, title, description)
class ShahidBaseIE(AWSIE):
    """Shared AWS-signed API plumbing for shahid.mbc.net extractors."""
    _AWS_PROXY_HOST = 'api2.shahid.net'
    _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh'
    _VALID_URL_BASE = r'https?://shahid\.mbc\.net/[a-z]{2}/'

    def _handle_error(self, e):
        """Surface the API's user-facing fault messages from an HTTP error."""
        fail_data = self._parse_json(
            e.cause.read().decode('utf-8'), None, fatal=False)
        if fail_data:
            faults = fail_data.get('faults', [])
            faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')])
            if faults_message:
                raise ExtractorError(faults_message, expected=True)

    def _call_api(self, path, video_id, request=None):
        query = {}
        if request:
            query['request'] = json.dumps(request)
        try:
            return self._aws_execute_api({
                'uri': '/proxy/v2/' + path,
                'access_key': 'AKIAI6X4TYCIXM2B7MUQ',
                'secret_key': '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',
            }, video_id, query)
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError):
                self._handle_error(e)
            raise


class ShahidIE(ShahidBaseIE):
    _NETRC_MACHINE = 'shahid'
    # Restored the stripped `type`/`id` named groups — _real_extract unpacks
    # (page_type, video_id) from .groups().
    _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924',
        'info_dict': {
            'id': '816924',
            'ext': 'mp4',
            'title': 'متحف الدحيح الموسم 1 كليب 1',
            'timestamp': 1602806400,
            'upload_date': '20201016',
            'description': 'برومو',
            'duration': 22,
            'categories': ['كوميديا'],
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'https://shahid.mbc.net/ar/movies/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9/movie-151746',
        'only_matching': True
    }, {
        # shahid plus subscriber only
        'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511',
        'only_matching': True
    }, {
        'url': 'https://shahid.mbc.net/en/shows/Ramez-Fi-Al-Shallal-season-1-episode-1/episode-359319',
        'only_matching': True
    }]

    def _real_initialize(self):
        email, password = self._get_login_info()
        if email is None:
            return

        try:
            user_data = self._download_json(
                'https://shahid.mbc.net/wd/service/users/login',
                None, 'Logging in', data=json.dumps({
                    'email': email,
                    'password': password,
                    'basic': 'false',
                }).encode('utf-8'), headers={
                    'Content-Type': 'application/json; charset=UTF-8',
                })['user']
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError):
                self._handle_error(e)
            raise

        # the session must be registered server-side before playback calls
        self._download_webpage(
            'https://shahid.mbc.net/populateContext',
            None, 'Populate Context', data=urlencode_postdata({
                'firstName': user_data['firstName'],
                'lastName': user_data['lastName'],
                'userName': user_data['email'],
                'csg_user_name': user_data['email'],
                'subscriberId': user_data['id'],
                'sessionId': user_data['sessionId'],
            }))

    def _real_extract(self, url):
        page_type, video_id = re.match(self._VALID_URL, url).groups()
        if page_type == 'clip':
            page_type = 'episode'

        playout = self._call_api(
            'playout/new/url/' + video_id, video_id)['playout']

        if playout.get('drm'):
            raise ExtractorError('This video is DRM protected.', expected=True)

        formats = self._extract_m3u8_formats(re.sub(
            # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html
            r'aws\.manifestfilter=[\w:;,-]+&?',
            '', playout['url']), video_id, 'mp4')
        self._sort_formats(formats)

        # video = self._call_api(
        #     'product/id', video_id, {
        #         'id': video_id,
        #         'productType': 'ASSET',
        #         'productSubType': page_type.upper()
        #     })['productModel']

        response = self._download_json(
            'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id),
            video_id, 'Downloading video JSON', query={
                'apiKey': 'sh@hid0nlin3',
                'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
            })
        data = response.get('data', {})
        error = data.get('error')
        if error:
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())),
                expected=True)

        video = data[page_type]
        title = video['title']
        categories = [
            category['name']
            for category in video.get('genres', []) if 'name' in category]

        return {
            'id': video_id,
            'title': title,
            'description': video.get('description'),
            'thumbnail': video.get('thumbnailUrl'),
            'duration': int_or_none(video.get('duration')),
            'timestamp': parse_iso8601(video.get('referenceDate')),
            'categories': categories,
            'series': video.get('showTitle') or video.get('showName'),
            'season': video.get('seasonTitle'),
            'season_number': int_or_none(video.get('seasonNumber')),
            'season_id': str_or_none(video.get('seasonId')),
            'episode_number': int_or_none(video.get('number')),
            'episode_id': video_id,
            'formats': formats,
        }


class ShahidShowIE(ShahidBaseIE):
    """Expands a show/series page into a paged playlist of its episodes."""
    _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187',
        'info_dict': {
            'id': '79187',
            'title': 'رامز قرش البحر',
            'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff',
        },
        'playlist_mincount': 32,
    }, {
        'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861',
        'only_matching': True
    }]
    _PAGE_SIZE = 30

    def _real_extract(self, url):
        show_id = self._match_id(url)

        product = self._call_api(
            'playableAsset', show_id, {'showId': show_id})['productModel']
        playlist = product['playlist']
        playlist_id = playlist['id']
        show = product.get('show', {})

        def page_func(page_num):
            playlist = self._call_api(
                'product/playlist', show_id, {
                    'playListId': playlist_id,
                    'pageNumber': page_num,
                    'pageSize': 30,
                    'sorts': [{
                        'order': 'DESC',
                        'type': 'SORTDATE'
                    }],
                })
            for product in playlist.get('productList', {}).get('products', []):
                product_url = product.get('productUrl', []).get('url')
                if not product_url:
                    continue
                yield self.url_result(
                    product_url, 'Shahid',
                    str_or_none(product.get('id')),
                    product.get('title'))

        entries = InAdvancePagedList(
            page_func,
            math.ceil(playlist['count'] / self._PAGE_SIZE),
            self._PAGE_SIZE)

        return self.playlist_result(
            entries, show_id, show.get('title'), show.get('description'))
class SharedBaseIE(InfoExtractor):
    """Common extraction flow for shared.sx-style file hosters.

    Subclasses provide _VALID_URL, _FILE_NOT_FOUND and _extract_video_url
    (plus optionally _extract_title/_extract_filesize overrides).
    """
    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage, urlh = self._download_webpage_handle(url, video_id)

        if self._FILE_NOT_FOUND in webpage:
            raise ExtractorError(
                'Video %s does not exist' % video_id, expected=True)

        video_url = self._extract_video_url(webpage, video_id, url)

        title = self._extract_title(webpage)
        filesize = int_or_none(self._extract_filesize(webpage))

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'filesize': filesize,
            'title': title,
        }

    def _extract_title(self, webpage):
        # the page stores the title base64-encoded in a meta tag
        return compat_b64decode(self._html_search_meta(
            'full:title', webpage, 'title')).decode('utf-8')

    def _extract_filesize(self, webpage):
        return self._html_search_meta(
            'full:size', webpage, 'file size', fatal=False)


class SharedIE(SharedBaseIE):
    IE_DESC = 'shared.sx'
    _VALID_URL = r'https?://shared\.sx/(?P<id>[\da-z]{10})'
    _FILE_NOT_FOUND = '>File does not exist<'

    _TEST = {
        'url': 'http://shared.sx/0060718775',
        'md5': '106fefed92a8a2adb8c98e6a0652f49b',
        'info_dict': {
            'id': '0060718775',
            'ext': 'mp4',
            'title': 'Bmp4',
            'filesize': 1720110,
        },
    }

    def _extract_video_url(self, webpage, video_id, url):
        # the direct URL is only revealed after re-posting the hidden form
        download_form = self._hidden_inputs(webpage)

        video_page = self._download_webpage(
            url, video_id, 'Downloading video page',
            data=urlencode_postdata(download_form),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': url,
            })

        video_url = self._html_search_regex(
            r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
            video_page, 'video URL', group='url')

        return video_url
if ext.lower() in KNOWN_EXTENSIONS: title = title.rpartition('.' + ext)[0] return title return self._og_search_title(webpage) def _extract_filesize(self, webpage): return parse_filesize(self._search_regex( r'data-type=["\']video["\'][^>]*>Watch.*?<strong>\s*\((.+?)\)', webpage, 'filesize', fatal=False)) def _extract_video_url(self, webpage, video_id, url): def decode_url_old(encoded_url): return compat_b64decode(encoded_url).decode('utf-8') stream_url = self._search_regex( r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'stream url', default=None, group='url') if stream_url: stream_url = url_or_none(decode_url_old(stream_url)) if stream_url: return stream_url def decode_url(encoded_url): return rot47(compat_urllib_parse_unquote_plus(encoded_url)) return decode_url(self._parse_json( self._search_regex( r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage, 'stream'), video_id, transform_source=js_to_json)['source']) ================================================ FILE: youtube_dl/extractor/showroomlive.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, urljoin, ) class ShowRoomLiveIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?showroom-live\.com/(?!onlive|timetable|event|campaign|news|ranking|room)(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://www.showroom-live.com/48_Nana_Okada', 'only_matching': True, } def _real_extract(self, url): broadcaster_id = self._match_id(url) webpage = self._download_webpage(url, broadcaster_id) room_id = self._search_regex( (r'SrGlobal\.roomId\s*=\s*(\d+)', r'(?:profile|room)\?room_id\=(\d+)'), webpage, 'room_id') room = self._download_json( urljoin(url, '/api/room/profile?room_id=%s' % room_id), broadcaster_id) is_live = room.get('is_onlive') if is_live is not True: raise ExtractorError('%s is offline' % broadcaster_id, expected=True) uploader 
= room.get('performer_name') or broadcaster_id title = room.get('room_name') or room.get('main_name') or uploader streaming_url_list = self._download_json( urljoin(url, '/api/live/streaming_url?room_id=%s' % room_id), broadcaster_id)['streaming_url_list'] formats = [] for stream in streaming_url_list: stream_url = stream.get('url') if not stream_url: continue stream_type = stream.get('type') if stream_type == 'hls': m3u8_formats = self._extract_m3u8_formats( stream_url, broadcaster_id, ext='mp4', m3u8_id='hls', live=True) for f in m3u8_formats: f['quality'] = int_or_none(stream.get('quality', 100)) formats.extend(m3u8_formats) elif stream_type == 'rtmp': stream_name = stream.get('stream_name') if not stream_name: continue formats.append({ 'url': stream_url, 'play_path': stream_name, 'page_url': url, 'player_url': 'https://www.showroom-live.com/assets/swf/v3/ShowRoomLive.swf', 'rtmp_live': True, 'ext': 'flv', 'format_id': 'rtmp', 'format_note': stream.get('label'), 'quality': int_or_none(stream.get('quality', 100)), }) self._sort_formats(formats) return { 'id': compat_str(room.get('live_id') or broadcaster_id), 'title': self._live_title(title), 'description': room.get('description'), 'timestamp': int_or_none(room.get('current_live_started_at')), 'uploader': uploader, 'uploader_id': broadcaster_id, 'view_count': int_or_none(room.get('view_num')), 'formats': formats, 'is_live': True, } ================================================ FILE: youtube_dl/extractor/simplecast.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( clean_podcast_url, int_or_none, parse_iso8601, strip_or_none, try_get, urlencode_postdata, ) class SimplecastBaseIE(InfoExtractor): _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' _API_BASE = 'https://api.simplecast.com/' def _call_api(self, path_tmpl, video_id): return self._download_json( self._API_BASE + path_tmpl % 
video_id, video_id) def _call_search_api(self, resource, resource_id, resource_url): return self._download_json( 'https://api.simplecast.com/%ss/search' % resource, resource_id, data=urlencode_postdata({'url': resource_url})) def _parse_episode(self, episode): episode_id = episode['id'] title = episode['title'].strip() audio_file = episode.get('audio_file') or {} audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url'] season = episode.get('season') or {} season_href = season.get('href') season_id = None if season_href: season_id = self._search_regex( r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX, season_href, 'season id', default=None) webpage_url = episode.get('episode_url') channel_url = None if webpage_url: channel_url = self._search_regex( r'(https?://[^/]+\.simplecast\.com)', webpage_url, 'channel url', default=None) return { 'id': episode_id, 'display_id': episode.get('slug'), 'title': title, 'url': clean_podcast_url(audio_file_url), 'webpage_url': webpage_url, 'channel_url': channel_url, 'series': try_get(episode, lambda x: x['podcast']['title']), 'season_number': int_or_none(season.get('number')), 'season_id': season_id, 'thumbnail': episode.get('image_url'), 'episode_id': episode_id, 'episode_number': int_or_none(episode.get('number')), 'description': strip_or_none(episode.get('description')), 'timestamp': parse_iso8601(episode.get('published_at')), 'duration': int_or_none(episode.get('duration')), 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')), } class SimplecastIE(SimplecastBaseIE): IE_NAME = 'simplecast' _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX _COMMON_TEST_INFO = { 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', 'ext': 'mp3', 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays', 'episode_number': 
1, 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', 'description': 'md5:34752789d3d2702e2d2c975fbd14f357', 'season_number': 1, 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13', 'series': 'The RE:BIND.io Podcast', 'duration': 5343, 'timestamp': 1580979475, 'upload_date': '20200206', 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$', } _TESTS = [{ 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876', 'md5': '8c93be7be54251bf29ee97464eabe61c', 'info_dict': _COMMON_TEST_INFO, }, { 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876', 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( r'''(?x)<iframe[^>]+src=["\'] ( https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| player\.simplecast\.com/%s ))''' % SimplecastBaseIE._UUID_REGEX, webpage) def _real_extract(self, url): episode_id = self._match_id(url) episode = self._call_api('episodes/%s', episode_id) return self._parse_episode(episode) class SimplecastEpisodeIE(SimplecastBaseIE): IE_NAME = 'simplecast:episode' _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)' _TEST = { 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', 'md5': '8c93be7be54251bf29ee97464eabe61c', 'info_dict': SimplecastIE._COMMON_TEST_INFO, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) episode = self._call_search_api( 'episode', mobj.group(1), mobj.group(0)) return self._parse_episode(episode) class SimplecastPodcastIE(SimplecastBaseIE): IE_NAME = 'simplecast:podcast' _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)' _TESTS = [{ 'url': 'https://the-re-bind-io-podcast.simplecast.com', 'playlist_mincount': 33, 'info_dict': { 'id': 
'07d28d26-7522-42eb-8c53-2bdcfc81c43c', 'title': 'The RE:BIND.io Podcast', }, }, { 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes', 'only_matching': True, }] def _real_extract(self, url): subdomain = self._match_id(url) site = self._call_search_api('site', subdomain, url) podcast = site['podcast'] podcast_id = podcast['id'] podcast_title = podcast.get('title') def entries(): episodes = self._call_api('podcasts/%s/episodes', podcast_id) for episode in (episodes.get('collection') or []): info = self._parse_episode(episode) info['series'] = podcast_title yield info return self.playlist_result(entries(), podcast_id, podcast_title) ================================================ FILE: youtube_dl/extractor/sina.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( HEADRequest, ExtractorError, int_or_none, update_url_query, qualities, get_element_by_attribute, clean_html, ) class SinaIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ (?: (?:view/|.*\#)(?P<video_id>\d+)| .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| # This is used by external sites like Weibo api/sinawebApi/outplay.php/(?P<token>.+?)\.swf ) ''' _TESTS = [ { 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', 'md5': 'd38433e2fc886007729735650ae4b3e9', 'info_dict': { 'id': '250576622', 'ext': 'mp4', 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名', } }, { 'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html', 'info_dict': { 'id': '101314253', 'ext': 'flv', 'title': '军方提高对朝情报监视级别', }, 'skip': 'the page does not exist or has been deleted', }, { 'url': 'http://video.sina.com.cn/view/250587748.html', 'md5': '3d1807a25c775092aab3bc157fff49b4', 'info_dict': { 'id': '250587748', 'ext': 'mp4', 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', }, }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = 
mobj.group('video_id') if not video_id: if mobj.group('token') is not None: # The video id is in the redirected url self.to_screen('Getting video id') request = HEADRequest(url) _, urlh = self._download_webpage_handle(request, 'NA', False) return self._real_extract(urlh.geturl()) else: pseudo_id = mobj.group('pseudo_id') webpage = self._download_webpage(url, pseudo_id) error = get_element_by_attribute('class', 'errtitle', webpage) if error: raise ExtractorError('%s said: %s' % ( self.IE_NAME, clean_html(error)), expected=True) video_id = self._search_regex( r"video_id\s*:\s*'(\d+)'", webpage, 'video id') video_data = self._download_json( 'http://s.video.sina.com.cn/video/h5play', video_id, query={'video_id': video_id}) if video_data['code'] != 1: raise ExtractorError('%s said: %s' % ( self.IE_NAME, video_data['message']), expected=True) else: video_data = video_data['data'] title = video_data['title'] description = video_data.get('description') if description: description = description.strip() preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) formats = [] for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): file_api = quality.get('file_api') file_id = quality.get('file_id') if not file_api or not file_id: continue formats.append({ 'format_id': quality_id, 'url': update_url_query(file_api, {'vid': file_id}), 'preference': preference(quality_id), 'ext': 'mp4', }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': video_data.get('image'), 'duration': int_or_none(video_data.get('length')), 'timestamp': int_or_none(video_data.get('create_time')), 'formats': formats, } ================================================ FILE: youtube_dl/extractor/sixplay.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_str, 
compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, int_or_none, try_get, qualities, ) class SixPlayIE(InfoExtractor): IE_NAME = '6play' _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr|rtlmost\.hu)/.+?-c_)(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051', 'md5': '31fcd112637baa0c2ab92c4fcd8baf27', 'info_dict': { 'id': '12041051', 'ext': 'mp4', 'title': 'Le but qui a marqué l\'histoire du football français !', 'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851', }, }, { 'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869', 'only_matching': True, }, { 'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989', 'only_matching': True, }, { 'url': 'https://www.rtlmost.hu/megtorve-p_14167/megtorve-6-resz-c_12397787', 'only_matching': True, }] def _real_extract(self, url): domain, video_id = re.search(self._VALID_URL, url).groups() service, consumer_name = { '6play.fr': ('6play', 'm6web'), 'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'), 'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'), 'rtlmost.hu': ('rtlhu_rtl_most', 'rtlhu'), }.get(domain, ('6play', 'm6web')) data = self._download_json( 'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id), video_id, headers={ 'x-customer-name': consumer_name }, query={ 'csa': 5, 'with': 'clips', }) clip_data = data['clips'][0] title = clip_data['title'] urls = [] quality_key = qualities(['lq', 'sd', 'hq', 'hd']) formats = [] subtitles = {} assets = clip_data.get('assets') or [] for asset in assets: asset_url = asset.get('full_physical_path') protocol = asset.get('protocol') if not asset_url or ((protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264') and not ('_drmnp.ism/' in asset_url or '_unpnp.ism/' in asset_url)) or asset_url in urls: 
continue urls.append(asset_url) container = asset.get('video_container') ext = determine_ext(asset_url) if protocol == 'http_subtitle' or ext == 'vtt': subtitles.setdefault('fr', []).append({'url': asset_url}) continue if container == 'm3u8' or ext == 'm3u8': if protocol == 'usp': if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: urlh = self._request_webpage( asset_url, video_id, fatal=False, headers=self.geo_verification_headers()) if not urlh: continue asset_url = urlh.geturl() asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/') for i in range(3, 0, -1): asset_url = asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) m3u8_formats = self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) formats.extend(self._extract_mpd_formats( asset_url.replace('.m3u8', '.mpd'), video_id, mpd_id='dash', fatal=False)) if m3u8_formats: break else: formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif container == 'mp4' or ext == 'mp4': quality = asset.get('video_quality') formats.append({ 'url': asset_url, 'format_id': quality, 'quality': quality_key(quality), 'ext': ext, }) self._sort_formats(formats) def get(getter): for src in (data, clip_data): v = try_get(src, getter, compat_str) if v: return v return { 'id': video_id, 'title': title, 'description': get(lambda x: x['description']), 'duration': int_or_none(clip_data.get('duration')), 'series': get(lambda x: x['program']['title']), 'formats': formats, 'subtitles': subtitles, } ================================================ FILE: youtube_dl/extractor/sky.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( extract_attributes, smuggle_url, strip_or_none, urljoin, ) class SkyBaseIE(InfoExtractor): BRIGHTCOVE_URL_TEMPLATE = 
class SkyBaseIE(InfoExtractor):
    # Delegates playback to either the Ooyala or Brightcove extractor,
    # depending on the provider declared on the sdc video element.
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
    _SDC_EL_REGEX = r'(?s)(<div[^>]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)'

    def _process_ooyala_element(self, webpage, sdc_el, url):
        """Build a url_transparent result from one sdc video element."""
        sdc = extract_attributes(sdc_el)
        provider = sdc.get('data-provider')
        if provider == 'ooyala':
            video_id = sdc['data-sdc-video-id']
            video_url = 'ooyala:%s' % video_id
            ie_key = 'Ooyala'
            ooyala_el = self._search_regex(
                r'(<div[^>]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id,
                webpage, 'video data', fatal=False)
            if ooyala_el:
                ooyala_attrs = extract_attributes(ooyala_el) or {}
                if ooyala_attrs.get('data-token-required') == 'true':
                    # Token-protected embeds need an embed_token smuggled
                    # through to the Ooyala extractor.
                    token_fetch_url = (self._parse_json(ooyala_attrs.get(
                        'data-token-fetch-options', '{}'),
                        video_id, fatal=False) or {}).get('url')
                    if token_fetch_url:
                        embed_token = self._download_json(urljoin(
                            url, token_fetch_url), video_id, fatal=False)
                        if embed_token:
                            video_url = smuggle_url(
                                video_url, {'embed_token': embed_token})
        elif provider == 'brightcove':
            video_id = sdc['data-video-id']
            account_id = sdc.get('data-account-id') or '6058004172001'
            player_id = sdc.get('data-player-id') or 'RC9PQUaJ6'
            video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id)
            ie_key = 'BrightcoveNew'

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'url': video_url,
            'ie_key': ie_key,
        }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        info = self._process_ooyala_element(webpage, self._search_regex(
            self._SDC_EL_REGEX, webpage, 'sdc element'), url)
        info.update({
            'title': self._og_search_title(webpage),
            'description': strip_or_none(self._og_search_description(webpage)),
        })
        return info


class SkySportsIE(SkyBaseIE):
    IE_NAME = 'sky:sports'
    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine',
        'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec',
        'info_dict': {
            'id': 'o3eWJnNDE6l7kfNO8BOoBlRxXRQ4ANNQ',
            'ext': 'mp4',
            'title': 'Bale: It\'s our time to shine',
            'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d',
        },
        'add_ie': ['Ooyala'],
    }, {
        'url': 'https://www.skysports.com/watch/video/sports/f1/12160544/abu-dhabi-gp-the-notebook',
        'only_matching': True,
    }, {
        'url': 'https://www.skysports.com/watch/video/tv-shows/12118508/rainford-brent-how-ace-programme-helps',
        'only_matching': True,
    }]


class SkyNewsIE(SkyBaseIE):
    IE_NAME = 'sky:news'
    _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962',
        'md5': '411e8893fd216c75eaf7e4c65d364115',
        'info_dict': {
            'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM',
            'ext': 'mp4',
            'title': 'Russian plane inspected after deadly fire',
            'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.',
            'uploader_id': '6058004172001',
            'timestamp': 1567112345,
            'upload_date': '20190829',
        },
        'add_ie': ['BrightcoveNew'],
    }


class SkySportsNewsIE(SkyBaseIE):
    IE_NAME = 'sky:sports:news'
    _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass',
        'info_dict': {
            'id': '10871916',
            'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass',
            'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.',
        },
        'playlist_count': 2,
    }

    def _real_extract(self, url):
        """Extract every embedded video of a news article as a playlist."""
        article_id = self._match_id(url)
        webpage = self._download_webpage(url, article_id)

        entries = []
        for sdc_el in re.findall(self._SDC_EL_REGEX, webpage):
            entries.append(self._process_ooyala_element(webpage, sdc_el, url))
article_id, self._og_search_title(webpage), self._html_search_meta(['og:description', 'description'], webpage)) ================================================ FILE: youtube_dl/extractor/skyit.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_str, compat_parse_qs, compat_urllib_parse_urlparse, ) from ..utils import ( dict_get, int_or_none, parse_duration, unified_timestamp, ) class SkyItPlayerIE(InfoExtractor): IE_NAME = 'player.sky.it' _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P<id>\d+)' _GEO_BYPASS = False _DOMAIN = 'sky' _PLAYER_TMPL = 'https://player.sky.it/player/external.html?id=%s&domain=%s' # http://static.sky.it/static/skyplayer/conf.json _TOKEN_MAP = { 'cielo': 'Hh9O7M8ks5yi6nSROL7bKYz933rdf3GhwZlTLMgvy4Q', 'hotclub': 'kW020K2jq2lk2eKRJD2vWEg832ncx2EivZlTLQput2C', 'mtv8': 'A5Nn9GGb326CI7vP5e27d7E4PIaQjota', 'salesforce': 'C6D585FD1615272C98DE38235F38BD86', 'sitocommerciale': 'VJwfFuSGnLKnd9Phe9y96WkXgYDCguPMJ2dLhGMb2RE', 'sky': 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk', 'skyacademy': 'A6LAn7EkO2Q26FRy0IAMBekX6jzDXYL3', 'skyarte': 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd', 'theupfront': 'PRSGmDMsg6QMGc04Obpoy7Vsbn7i2Whp', } def _player_url_result(self, video_id): return self.url_result( self._PLAYER_TMPL % (video_id, self._DOMAIN), SkyItPlayerIE.ie_key(), video_id) def _parse_video(self, video, video_id): title = video['title'] is_live = video.get('type') == 'live' hls_url = video.get(('streaming' if is_live else 'hls') + '_url') if not hls_url and video.get('geoblock' if is_live else 'geob'): self.raise_geo_restricted(countries=['IT']) if is_live: formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4') else: formats = self._extract_akamai_formats( hls_url, video_id, {'http': 'videoplatform.sky.it'}) self._sort_formats(formats) return { 'id': video_id, 'title': 
self._live_title(title) if is_live else title, 'formats': formats, 'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')), 'description': video.get('short_desc') or None, 'timestamp': unified_timestamp(video.get('create_date')), 'duration': int_or_none(video.get('duration_sec')) or parse_duration(video.get('duration')), 'is_live': is_live, } def _real_extract(self, url): video_id = self._match_id(url) domain = compat_parse_qs(compat_urllib_parse_urlparse( url).query).get('domain', [None])[0] token = dict_get(self._TOKEN_MAP, (domain, 'sky')) video = self._download_json( 'https://apid.sky.it/vdp/v1/getVideoData', video_id, query={ 'caller': 'sky', 'id': video_id, 'token': token }, headers=self.geo_verification_headers()) return self._parse_video(video, video_id) class SkyItVideoIE(SkyItPlayerIE): IE_NAME = 'video.sky.it' _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P<id>\d+)' _TESTS = [{ 'url': 'https://video.sky.it/news/mondo/video/uomo-ucciso-da-uno-squalo-in-australia-631227', 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', 'info_dict': { 'id': '631227', 'ext': 'mp4', 'title': 'Uomo ucciso da uno squalo in Australia', 'timestamp': 1606036192, 'upload_date': '20201122', } }, { 'url': 'https://xfactor.sky.it/video/x-factor-2020-replay-audizioni-1-615820', 'only_matching': True, }, { 'url': 'https://masterchef.sky.it/video/masterchef-9-cosa-e-successo-nella-prima-puntata-562831', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) return self._player_url_result(video_id) class SkyItVideoLiveIE(SkyItPlayerIE): IE_NAME = 'video.sky.it:live' _VALID_URL = r'https?://video\.sky\.it/diretta/(?P<id>[^/?&#]+)' _TEST = { 'url': 'https://video.sky.it/diretta/tg24', 'info_dict': { 'id': '1', 'ext': 'mp4', 'title': r're:Diretta TG24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}', 'description': 'Guarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24.', 
}, 'params': { # m3u8 download 'skip_download': True, }, } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) asset_id = compat_str(self._parse_json(self._search_regex( r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id']) livestream = self._download_json( 'https://apid.sky.it/vdp/v1/getLivestream', asset_id, query={'id': asset_id}) return self._parse_video(livestream, asset_id) class SkyItIE(SkyItPlayerIE): IE_NAME = 'sky.it' _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://sport.sky.it/calcio/serie-a/2020/11/21/juventus-cagliari-risultato-gol', 'info_dict': { 'id': '631201', 'ext': 'mp4', 'title': 'Un rosso alla violenza: in campo per i diritti delle donne', 'upload_date': '20201121', 'timestamp': 1605995753, }, 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://tg24.sky.it/mondo/2020/11/22/australia-squalo-uccide-uomo', 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', 'info_dict': { 'id': '631227', 'ext': 'mp4', 'title': 'Uomo ucciso da uno squalo in Australia', 'timestamp': 1606036192, 'upload_date': '20201122', }, }] _VIDEO_ID_REGEX = r'data-videoid="(\d+)"' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( self._VIDEO_ID_REGEX, webpage, 'video id') return self._player_url_result(video_id) class SkyItAcademyIE(SkyItIE): IE_NAME = 'skyacademy.it' _VALID_URL = r'https?://(?:www\.)?skyacademy\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://www.skyacademy.it/eventi-speciali/2019/07/05/a-lezione-di-cinema-con-sky-academy-/', 'md5': 'ced5c26638b7863190cbc44dd6f6ba08', 'info_dict': { 'id': '523458', 'ext': 'mp4', 'title': 'Sky Academy "The Best CineCamp 2019"', 'timestamp': 1562843784, 
'upload_date': '20190711', } }] _DOMAIN = 'skyacademy' _VIDEO_ID_REGEX = r'id="news-videoId_(\d+)"' class SkyItArteIE(SkyItIE): IE_NAME = 'arte.sky.it' _VALID_URL = r'https?://arte\.sky\.it/video/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://arte.sky.it/video/serie-musei-venezia-collezionismo-12-novembre/', 'md5': '515aee97b87d7a018b6c80727d3e7e17', 'info_dict': { 'id': '627926', 'ext': 'mp4', 'title': "Musei Galleria Franchetti alla Ca' d'Oro Palazzo Grimani", 'upload_date': '20201106', 'timestamp': 1604664493, } }] _DOMAIN = 'skyarte' _VIDEO_ID_REGEX = r'(?s)<iframe[^>]+src="(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)' class CieloTVItIE(SkyItIE): IE_NAME = 'cielotv.it' _VALID_URL = r'https?://(?:www\.)?cielotv\.it/video/(?P<id>[^.]+)\.html' _TESTS = [{ 'url': 'https://www.cielotv.it/video/Il-lunedi-e-sempre-un-dramma.html', 'md5': 'c4deed77552ba901c2a0d9258320304b', 'info_dict': { 'id': '499240', 'ext': 'mp4', 'title': 'Il lunedì è sempre un dramma', 'upload_date': '20190329', 'timestamp': 1553862178, } }] _DOMAIN = 'cielo' _VIDEO_ID_REGEX = r'videoId\s*=\s*"(\d+)"' class TV8ItIE(SkyItVideoIE): IE_NAME = 'tv8.it' _VALID_URL = r'https?://tv8\.it/showvideo/(?P<id>\d+)' _TESTS = [{ 'url': 'https://tv8.it/showvideo/630529/ogni-mattina-ucciso-asino-di-andrea-lo-cicero/18-11-2020/', 'md5': '9ab906a3f75ea342ed928442f9dabd21', 'info_dict': { 'id': '630529', 'ext': 'mp4', 'title': 'Ogni mattina - Ucciso asino di Andrea Lo Cicero', 'timestamp': 1605721374, 'upload_date': '20201118', } }] _DOMAIN = 'mtv8' ================================================ FILE: youtube_dl/extractor/skylinewebcams.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor class SkylineWebcamsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?skylinewebcams\.com/[^/]+/webcam/(?:[^/]+/)+(?P<id>[^/]+)\.html' _TEST = { 'url': 
'https://www.skylinewebcams.com/it/webcam/italia/lazio/roma/scalinata-piazza-di-spagna-barcaccia.html', 'info_dict': { 'id': 'scalinata-piazza-di-spagna-barcaccia', 'ext': 'mp4', 'title': 're:^Live Webcam Scalinata di Piazza di Spagna - La Barcaccia [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'Roma, veduta sulla Scalinata di Piazza di Spagna e sulla Barcaccia', 'is_live': True, }, 'params': { 'skip_download': True, } } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) stream_url = self._search_regex( r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage, 'stream url', group='url') title = self._og_search_title(webpage) description = self._og_search_description(webpage) return { 'id': video_id, 'url': stream_url, 'ext': 'mp4', 'title': self._live_title(title), 'description': description, 'is_live': True, } ================================================ FILE: youtube_dl/extractor/skynewsarabia.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( parse_iso8601, parse_duration, ) class SkyNewsArabiaBaseIE(InfoExtractor): _IMAGE_BASE_URL = 'http://www.skynewsarabia.com/web/images' def _call_api(self, path, value): return self._download_json('http://api.skynewsarabia.com/web/rest/v2/%s/%s.json' % (path, value), value) def _get_limelight_media_id(self, url): return self._search_regex(r'/media/[^/]+/([a-z0-9]{32})', url, 'limelight media id') def _get_image_url(self, image_path_template, width='1600', height='1200'): return self._IMAGE_BASE_URL + image_path_template.format(width=width, height=height) def _extract_video_info(self, video_data): video_id = compat_str(video_data['id']) topic = video_data.get('topicTitle') return { '_type': 'url_transparent', 'url': 'limelight:media:%s' % 
self._get_limelight_media_id(video_data['videoUrl'][0]['url']), 'id': video_id, 'title': video_data['headline'], 'description': video_data.get('summary'), 'thumbnail': self._get_image_url(video_data['mediaAsset']['imageUrl']), 'timestamp': parse_iso8601(video_data.get('date')), 'duration': parse_duration(video_data.get('runTime')), 'tags': video_data.get('tags', []), 'categories': [topic] if topic else [], 'webpage_url': 'http://www.skynewsarabia.com/web/video/%s' % video_id, 'ie_key': 'LimelightMedia', } class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): IE_NAME = 'skynewsarabia:video' _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.skynewsarabia.com/web/video/794902/%D9%86%D8%B5%D9%81-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D9%85%D8%B5%D8%A8%D8%A7%D8%AD-%D8%B4%D8%AC%D8%B1%D8%A9-%D9%83%D8%B1%D9%8A%D8%B3%D9%85%D8%A7%D8%B3', 'info_dict': { 'id': '794902', 'ext': 'flv', 'title': 'نصف مليون مصباح على شجرة كريسماس', 'description': 'md5:22f1b27f0850eeb10c7e59b1f16eb7c6', 'upload_date': '20151128', 'timestamp': 1448697198, 'duration': 2119, }, 'params': { # rtmp download 'skip_download': True, }, } def _real_extract(self, url): video_id = self._match_id(url) video_data = self._call_api('video', video_id) return self._extract_video_info(video_data) class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): IE_NAME = 'skynewsarabia:article' _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', 'info_dict': { 'id': '794549', 'ext': 'flv', 'title': 'بالفيديو.. 
ألعاب ذكية تحاكي واقع المنطقة',
            'description': 'md5:0c373d29919a851e080ee4edd0c5d97f',
            'upload_date': '20151126',
            'timestamp': 1448559336,
            'duration': 281.6,
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.skynewsarabia.com/web/article/794844/%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81-%D9%82%D9%88%D8%A7%D8%B1%D8%A8-%D8%A7%D9%94%D8%B3%D9%84%D8%AD%D8%A9-%D9%84%D9%85%D9%8A%D9%84%D9%8A%D8%B4%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D8%AD%D9%88%D8%AB%D9%8A-%D9%88%D8%B5%D8%A7%D9%84%D8%AD',
        'info_dict': {
            'id': '794844',
            'title': 'إحباط تهريب أسلحة لميليشيات الحوثي وصالح بجنوب اليمن',
            'description': 'md5:5c927b8b2e805796e7f693538d96fc7e',
        },
        'playlist_mincount': 2,
    }]

    def _real_extract(self, url):
        # If the article has a lead VIDEO asset, return it as a single
        # url_transparent entry; otherwise collect all inline videos
        # into a playlist.
        article_id = self._match_id(url)
        article_data = self._call_api('article', article_id)
        media_asset = article_data['mediaAsset']
        if media_asset['type'] == 'VIDEO':
            topic = article_data.get('topicTitle')
            return {
                '_type': 'url_transparent',
                'url': 'limelight:media:%s' % self._get_limelight_media_id(media_asset['videoUrl'][0]['url']),
                'id': article_id,
                'title': article_data['headline'],
                'description': article_data.get('summary'),
                'thumbnail': self._get_image_url(media_asset['imageUrl']),
                'timestamp': parse_iso8601(article_data.get('date')),
                'tags': article_data.get('tags', []),
                'categories': [topic] if topic else [],
                'webpage_url': url,
                'ie_key': 'LimelightMedia',
            }
        entries = [self._extract_video_info(item) for item in article_data.get('inlineItems', []) if item['type'] == 'VIDEO']
        return self.playlist_result(entries, article_id, article_data['headline'], article_data.get('summary'))


================================================
FILE: youtube_dl/extractor/slideshare.py
================================================
from __future__ import unicode_literals

import re
import json

from .common import InfoExtractor
from ..compat import (
    compat_urlparse,
)
from ..utils import (
    ExtractorError,
    get_element_by_id,
)


class SlideshareIE(InfoExtractor):
    _VALID_URL = 
r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'

    _TEST = {
        'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
        'info_dict': {
            'id': '25665706',
            'ext': 'mp4',
            'title': 'Managing Scale and Complexity',
            'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
        },
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        page_title = mobj.group('title')
        webpage = self._download_webpage(url, page_title)
        # The page passes a JSON config object to $.extend(); grab and parse it.
        slideshare_obj = self._search_regex(
            r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',
            webpage, 'slideshare object')
        info = json.loads(slideshare_obj)
        # Only 'video' slideshows carry a downloadable media file.
        if info['slideshow']['type'] != 'video':
            raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)

        doc = info['doc']
        bucket = info['jsplayer']['video_bucket']
        ext = info['jsplayer']['video_extension']
        # Media lives at <bucket>/<doc>-SD.<ext>
        video_url = compat_urlparse.urljoin(bucket, doc + '-SD.'
+ ext)
        # Prefer the dedicated description element; fall back to itemprop markup.
        description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
            r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
            'description', fatal=False)

        return {
            '_type': 'video',
            'id': info['slideshow']['id'],
            'title': info['slideshow']['title'],
            'ext': ext,
            'url': video_url,
            'thumbnail': info['slideshow']['pin_image_url'],
            'description': description.strip() if description else None,
        }


================================================
FILE: youtube_dl/extractor/slideslive.py
================================================
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    bool_or_none,
    smuggle_url,
    try_get,
    url_or_none,
)


class SlidesLiveIE(InfoExtractor):
    _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)'
    _TESTS = [{
        # video_service_name = YOUTUBE
        'url': 'https://slideslive.com/38902413/gcc-ia16-backend',
        'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f',
        'info_dict': {
            'id': 'LMtgR8ba0b0',
            'ext': 'mp4',
            'title': 'GCC IA16 backend',
            'description': 'Watch full version of this video at https://slideslive.com/38902413.',
            'uploader': 'SlidesLive Videos - A',
            'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
            'timestamp': 1597615266,
            'upload_date': '20170925',
        }
    }, {
        # video_service_name = yoda
        'url': 'https://slideslive.com/38935785',
        'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a',
        'info_dict': {
            'id': 'RMraDYN5ozA_',
            'ext': 'mp4',
            'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges',
        },
        'params': {
            'format': 'bestvideo',
        },
    }, {
        # video_service_name = youtube
        'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
        'only_matching': True,
    }, {
        # video_service_name = url
        'url': 'https://slideslive.com/38922070/learning-transferable-skills-1',
        'only_matching': True,
    }, {
        # video_service_name = vimeo
        'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3',
'only_matching': True,
    }]

    def _real_extract(self, url):
        # The player API reports which backing service hosts the media
        # ('url' = direct URL, 'yoda' = SlidesLive's own CDN, or an external
        # service like vimeo/youtube that we delegate to).
        video_id = self._match_id(url)
        video_data = self._download_json(
            'https://ben.slideslive.com/player/' + video_id, video_id)
        service_name = video_data['video_service_name'].lower()
        assert service_name in ('url', 'yoda', 'vimeo', 'youtube')
        service_id = video_data['video_service_id']

        subtitles = {}
        for sub in try_get(video_data, lambda x: x['subtitles'], list) or []:
            if not isinstance(sub, dict):
                continue
            webvtt_url = url_or_none(sub.get('webvtt_url'))
            if not webvtt_url:
                continue
            lang = sub.get('language') or 'en'
            subtitles.setdefault(lang, []).append({
                'url': webvtt_url,
            })

        info = {
            'id': video_id,
            'thumbnail': video_data.get('thumbnail'),
            'is_live': bool_or_none(video_data.get('is_live')),
            'subtitles': subtitles,
        }

        if service_name in ('url', 'yoda'):
            info['title'] = video_data['title']
            if service_name == 'url':
                # service_id is the direct media URL in this case.
                info['url'] = service_id
            else:
                # 'yoda': fetch both HLS and DASH manifests from the CDN.
                formats = []
                _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s'
                # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
                formats.extend(self._extract_m3u8_formats(
                    _MANIFEST_PATTERN % (service_id, 'm3u8'),
                    service_id, 'mp4', m3u8_id='hls', fatal=False))
                formats.extend(self._extract_mpd_formats(
                    _MANIFEST_PATTERN % (service_id, 'mpd'), service_id,
                    mpd_id='dash', fatal=False))
                self._sort_formats(formats)
                info.update({
                    'id': service_id,
                    'formats': formats,
                })
        else:
            # External service: hand off via url_transparent.
            info.update({
                '_type': 'url_transparent',
                'url': service_id,
                'ie_key': service_name.capitalize(),
                'title': video_data.get('title'),
            })
            if service_name == 'vimeo':
                # Vimeo requires the embedding page as Referer.
                info['url'] = smuggle_url(
                    'https://player.vimeo.com/video/' + service_id,
                    {'http_headers': {'Referer': url}})
        return info


================================================
FILE: youtube_dl/extractor/slutload.py
================================================
from __future__ import unicode_literals

from .common import InfoExtractor


class SlutloadIE(InfoExtractor):
    _VALID_URL = 
r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)'
    _TESTS = [{
        'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
        'md5': '868309628ba00fd488cf516a113fd717',
        'info_dict': {
            'id': 'TD73btpBqSxc',
            'ext': 'mp4',
            'title': 'virginie baisee en cam',
            'age_limit': 18,
            'thumbnail': r're:https?://.*?\.jpg'
        },
    }, {
        # mobile site
        'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/',
        'only_matching': True,
    }, {
        'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/',
        'only_matching': True,
    }, {
        'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # Try the lightweight embed player first (best-effort: fatal=False).
        embed_page = self._download_webpage(
            'http://www.slutload.com/embed_player/%s' % video_id, video_id,
            'Downloading embed page', fatal=False)

        if embed_page:
            def extract(what):
                # Pull data-video-<what>="..." attributes from the embed markup.
                return self._html_search_regex(
                    r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what,
                    embed_page, 'video %s' % what, default=None, group='url')

            video_url = extract('url')
            if video_url:
                title = self._html_search_regex(
                    r'<title>([^<]+)', embed_page, 'title', default=video_id)
                return {
                    'id': video_id,
                    'url': video_url,
                    'title': title,
                    'thumbnail': extract('preview'),
                    'age_limit': 18
                }

        # Fall back to the full desktop page and its HTML5 media entries.
        webpage = self._download_webpage(
            'http://www.slutload.com/video/_/%s/' % video_id, video_id)
        title = self._html_search_regex(
            r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip()
        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
        info.update({
            'id': video_id,
            'title': title,
            'age_limit': 18,
        })
        return info


================================================
FILE: youtube_dl/extractor/snotr.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    parse_filesize,
    str_to_int,
)


class SnotrIE(InfoExtractor):
    _VALID_URL = 
r'http?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)' _TESTS = [{ 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', 'info_dict': { 'id': '13708', 'ext': 'mp4', 'title': 'Drone flying through fireworks!', 'duration': 248, 'filesize_approx': 40700000, 'description': 'A drone flying through Fourth of July Fireworks', 'thumbnail': r're:^https?://.*\.jpg$', }, 'expected_warnings': ['description'], }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', 'info_dict': { 'id': '530', 'ext': 'mp4', 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, 'filesize_approx': 8500000, 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', 'thumbnail': r're:^https?://.*\.jpg$', } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) description = self._og_search_description(webpage) info_dict = self._parse_html5_media_entries( url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] view_count = str_to_int(self._html_search_regex( r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)', webpage, 'view count', fatal=False)) duration = parse_duration(self._html_search_regex( r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)', webpage, 'duration', fatal=False)) filesize_approx = parse_filesize(self._html_search_regex( r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)', webpage, 'filesize', fatal=False)) info_dict.update({ 'id': video_id, 'description': description, 'title': title, 'view_count': view_count, 'duration': duration, 'filesize_approx': filesize_approx, }) return info_dict ================================================ FILE: youtube_dl/extractor/sohu.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import 
InfoExtractor
from ..compat import (
    compat_str,
    compat_urllib_parse_urlencode,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    try_get,
)


class SohuIE(InfoExtractor):
    # (?(mytv)|n): the 'n' path prefix is only present on tv.sohu.com URLs,
    # not on my.tv.sohu.com ones.
    _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'

    # Sohu videos give different MD5 sums on Travis CI and my machine
    _TESTS = [{
        'note': 'This video is available only in Mainland China',
        'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super',
        'info_dict': {
            'id': '382479172',
            'ext': 'mp4',
            'title': 'MV:Far East Movement《The Illest》',
        },
        'skip': 'On available in China',
    }, {
        'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
        'info_dict': {
            'id': '409385080',
            'ext': 'mp4',
            'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
        }
    }, {
        'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
        'info_dict': {
            'id': '78693464',
            'ext': 'mp4',
            'title': '【爱范品】第31期:MWC见不到的奇葩手机',
        }
    }, {
        'note': 'Multipart video',
        'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml',
        'info_dict': {
            'id': '78910339',
            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
        },
        'playlist': [{
            'info_dict': {
                'id': '78910339_part1',
                'ext': 'mp4',
                'duration': 294,
                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            }
        }, {
            'info_dict': {
                'id': '78910339_part2',
                'ext': 'mp4',
                'duration': 300,
                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            }
        }, {
            'info_dict': {
                'id': '78910339_part3',
                'ext': 'mp4',
                'duration': 150,
                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            }
        }]
    }, {
        'note': 'Video with title containing dash',
        'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml',
        'info_dict': {
            'id': '78932792',
            'ext': 'mp4',
            'title': 'youtube-dl testing video',
        },
        'params': {
            'skip_download': True
        }
    }]

    def _real_extract(self, url):
        def _fetch_data(vid_id, mytv=False):
            # Different JSON endpoints for my.tv.sohu.com vs tv.sohu.com.
            # Note: closes over video_id, which is assigned below before
            # the first call.
            if mytv:
                base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid='
            else:
                base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='

            return self._download_json(
                base_data_url + vid_id, video_id,
                'Downloading JSON data for %s' % vid_id,
                headers=self.geo_verification_headers())

        mobj = 
re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        mytv = mobj.group('mytv') is not None

        webpage = self._download_webpage(url, video_id)

        # Strip the site-name suffix (" - 搜狐视频") from the og:title.
        title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage))

        vid = self._html_search_regex(
            r'var vid ?= ?["\'](\d+)["\']',
            webpage, 'video path')
        vid_data = _fetch_data(vid, mytv)
        if vid_data['play'] != 1:
            if vid_data.get('status') == 12:
                raise ExtractorError(
                    '%s said: There\'s something wrong in the video.' % self.IE_NAME,
                    expected=True)
            else:
                self.raise_geo_restricted(
                    '%s said: The video is only licensed to users in Mainland China.' % self.IE_NAME)

        # Each quality level has its own vid; reuse vid_data when it matches.
        formats_json = {}
        for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
            vid_id = vid_data['data'].get('%sVid' % format_id)
            if not vid_id:
                continue
            vid_id = compat_str(vid_id)
            formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv)

        part_count = vid_data['data']['totalBlocks']

        playlist = []
        for i in range(part_count):
            formats = []
            for format_id, format_data in formats_json.items():
                allot = format_data['allot']
                data = format_data['data']
                clips_url = data['clipsURL']
                su = data['su']

                # The CDN sometimes answers with the placeholder host
                # 'newflv.sohu.ccgslb.net'; retry (up to 5 times) until a
                # real URL is returned, passing back the 'nid' hint.
                video_url = 'newflv.sohu.ccgslb.net'
                cdnId = None
                retries = 0

                while 'newflv.sohu.ccgslb.net' in video_url:
                    params = {
                        'prot': 9,
                        'file': clips_url[i],
                        'new': su[i],
                        'prod': 'flash',
                        'rb': 1,
                    }

                    if cdnId is not None:
                        params['idc'] = cdnId

                    download_note = 'Downloading %s video URL part %d of %d' % (
                        format_id, i + 1, part_count)

                    if retries > 0:
                        download_note += ' (retry #%d)' % retries
                    part_info = self._parse_json(self._download_webpage(
                        'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)),
                        video_id, download_note), video_id)

                    video_url = part_info['url']
                    cdnId = part_info.get('nid')

                    retries += 1
                    if retries > 5:
                        raise ExtractorError('Failed to get video URL')

                formats.append({
                    'url': video_url,
                    'format_id': format_id,
                    'filesize': int_or_none(
                        try_get(data, lambda x: x['clipsBytes'][i])),
                    'width': int_or_none(data.get('width')),
                    'height': 
int_or_none(data.get('height')),
                    'fps': int_or_none(data.get('fps')),
                })
            self._sort_formats(formats)

            playlist.append({
                'id': '%s_part%d' % (video_id, i + 1),
                'title': title,
                'duration': vid_data['data']['clipsDuration'][i],
                'formats': formats,
            })

        # Single-part videos are returned directly; multipart ones as
        # a multi_video playlist.
        if len(playlist) == 1:
            info = playlist[0]
            info['id'] = video_id
        else:
            info = {
                '_type': 'multi_video',
                'entries': playlist,
                'id': video_id,
                'title': title,
            }

        return info


================================================
FILE: youtube_dl/extractor/sonyliv.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import time
import uuid

from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
    ExtractorError,
    int_or_none,
)


class SonyLIVIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true',
        'info_dict': {
            'title': 'Bachelors Delight - Achaari Cheese Toast',
            'id': '1000022678',
            'ext': 'mp4',
            'upload_date': '20200411',
            'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb',
            'timestamp': 1586632091,
            'duration': 185,
            'season_number': 1,
            'episode': 'Achaari Cheese Toast',
            'episode_number': 1,
            'release_year': 2016,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true',
        'only_matching': True,
    }, {
        'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925',
        'only_matching': True,
    }, {
        'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true',
        'only_matching': True,
    }, {
        'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true',
        'only_matching': True,
    }, {
        'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779',
        'only_matching': True,
}]
    _GEO_COUNTRIES = ['IN']
    # Security token obtained in _real_initialize and sent with API calls.
    _TOKEN = None

    def _call_api(self, version, path, video_id):
        # All SonyLIV endpoints share this AGL base; a 403 with
        # 'Geoblocked Country' is mapped to a geo-restriction error.
        headers = {}
        if self._TOKEN:
            headers['security_token'] = self._TOKEN
        try:
            return self._download_json(
                'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path),
                video_id, headers=headers)['resultObj']
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                message = self._parse_json(
                    e.cause.read().decode(), video_id)['message']
                if message == 'Geoblocked Country':
                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
                raise ExtractorError(message)
            raise

    def _real_initialize(self):
        self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None)

    def _real_extract(self, url):
        video_id = self._match_id(url)
        content = self._call_api(
            '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id)
        if content.get('isEncrypted'):
            raise ExtractorError('This video is DRM protected.', expected=True)
        dash_url = content['videoURL']
        # Per-session playback id required by the CDN.
        headers = {
            'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000)
        }
        formats = self._extract_mpd_formats(
            dash_url, video_id, mpd_id='dash', headers=headers, fatal=False)
        # The HLS manifest lives at the same path with DASH→HLS substitutions.
        formats.extend(self._extract_m3u8_formats(
            dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'),
            video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False))
        for f in formats:
            f.setdefault('http_headers', {}).update(headers)
        self._sort_formats(formats)
        metadata = self._call_api(
            '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata']
        title = metadata['title']
        episode = metadata.get('episodeTitle')
        if episode and title != episode:
            title += ' - ' + episode
        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': content.get('posterURL'),
            'description': metadata.get('longDescription') or metadata.get('shortDescription'),
            'timestamp': int_or_none(metadata.get('creationDate'), 1000),
            'duration': int_or_none(metadata.get('duration')),
            'season_number': int_or_none(metadata.get('season')),
            'episode': episode,
'episode_number': int_or_none(metadata.get('episodeNumber')),
            'release_year': int_or_none(metadata.get('year')),
        }


================================================
FILE: youtube_dl/extractor/soundcloud.py
================================================
# coding: utf-8
from __future__ import unicode_literals

import itertools
import re

from .common import (
    InfoExtractor,
    SearchInfoExtractor
)
from ..compat import (
    compat_HTTPError,
    compat_kwargs,
    compat_str,
    compat_urlparse,
)
from ..utils import (
    error_to_compat_str,
    ExtractorError,
    float_or_none,
    HEADRequest,
    int_or_none,
    KNOWN_EXTENSIONS,
    mimetype2ext,
    str_or_none,
    try_get,
    unified_timestamp,
    update_url_query,
    url_or_none,
    urlhandle_detect_ext,
)


class SoundcloudEmbedIE(InfoExtractor):
    # Widget/player embeds: the real resource URL is in the 'url' query arg.
    _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)'
    _TEST = {
        # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/
        'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey',
        'only_matching': True,
    }

    @staticmethod
    def _extract_urls(webpage):
        # Collect all SoundCloud player iframe URLs embedded in a page.
        return [m.group('url') for m in re.finditer(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
            webpage)]

    def _real_extract(self, url):
        # Unwrap the embed: forward the inner API URL (plus secret_token
        # if present) to the matching SoundCloud extractor.
        query = compat_urlparse.parse_qs(
            compat_urlparse.urlparse(url).query)
        api_url = query['url'][0]
        secret_token = query.get('secret_token')
        if secret_token:
            api_url = update_url_query(api_url, {'secret_token': secret_token[0]})
        return self.url_result(api_url)


class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. 
Then the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'''(?x)^(?:https?://)?
                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
                            (?!stations/track)
                            (?P<uploader>[\w\d-]+)/
                            (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
                            (?P<title>[\w\d-]+)/?
                            (?P<token>[^?]+?)?(?:[?].*)?$)
                       |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
                          (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
                    )
                    '''
    IE_NAME = 'soundcloud'
    _TESTS = [
        {
            'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
            'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
            'info_dict': {
                'id': '62986583',
                'ext': 'mp3',
                'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
                'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
                'uploader': 'E.T. ExTerrestrial Music',
                'uploader_id': '1571244',
                'timestamp': 1349920598,
                'upload_date': '20121011',
                'duration': 143.216,
                'license': 'all-rights-reserved',
                'view_count': int,
                'like_count': int,
                'comment_count': int,
                'repost_count': int,
            }
        },
        # geo-restricted
        {
            'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
            'info_dict': {
                'id': '47127627',
                'ext': 'mp3',
                'title': 'Goldrushed',
                'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
                'uploader': 'The Royal Concept',
                'uploader_id': '9615865',
                'timestamp': 1337635207,
                'upload_date': '20120521',
                'duration': 227.155,
                'license': 'all-rights-reserved',
                'view_count': int,
                'like_count': int,
                'comment_count': int,
                'repost_count': int,
            },
        },
        # private link
        {
            'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'description': 'test chars: \"\'/\\ä↭',
                'uploader': 'jaimeMF',
                'uploader_id': '69767071',
                'timestamp': 1386604920,
                'upload_date': '20131209',
                'duration': 9.927,
                'license': 'all-rights-reserved',
                'view_count': int,
                'like_count': int,
                'comment_count': int,
                'repost_count': int,
            },
        },
        # private link (alt format)
        {
            'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'description': 'test chars: \"\'/\\ä↭',
                'uploader': 'jaimeMF',
                'uploader_id': '69767071',
                'timestamp': 1386604920,
                'upload_date': '20131209',
                'duration': 9.927,
                'license': 'all-rights-reserved',
                'view_count': int,
                'like_count': int,
                'comment_count': int,
                'repost_count': int,
            },
        },
        # downloadable song
        {
            'url': 'https://soundcloud.com/oddsamples/bus-brakes',
            'md5': '7624f2351f8a3b2e7cd51522496e7631',
            'info_dict': {
                'id': '128590877',
                'ext': 'mp3',
                'title': 'Bus Brakes',
                'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
                'uploader': 'oddsamples',
                'uploader_id': '73680509',
                'timestamp': 1389232924,
                'upload_date': '20140109',
                'duration': 17.346,
                'license': 'cc-by-sa',
                'view_count': int,
                'like_count': int,
                'comment_count': int,
                'repost_count': int,
            },
        },
        # private link, downloadable format
        {
            'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd',
            'md5': '64a60b16e617d41d0bef032b7f55441e',
            'info_dict': {
                'id': '340344461',
                'ext': 'wav',
                'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
                'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
                'uploader': 'Ori Uplift Music',
                'uploader_id': '12563093',
                'timestamp': 1504206263,
                'upload_date': '20170831',
                'duration': 7449.096,
                'license': 'all-rights-reserved',
                'view_count': int,
                'like_count': int,
                'comment_count': int,
                'repost_count': int,
            },
        },
        # no album art, use avatar pic for thumbnail
        {
            'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real',
            'md5': '59c7872bc44e5d99b7211891664760c2',
            'info_dict': {
                'id': '309699954',
                'ext': 'mp3',
                'title': 'Sideways (Prod. Mad Real)',
                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
                'uploader': 'garyvee',
                'uploader_id': '2366352',
                'timestamp': 1488152409,
                'upload_date': '20170226',
                'duration': 207.012,
                'thumbnail': r're:https?://.*\.jpg',
                'license': 'all-rights-reserved',
                'view_count': int,
                'like_count': int,
                'comment_count': int,
                'repost_count': int,
            },
            'params': {
                'skip_download': True,
            },
        },
        {
            'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
            'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
            'info_dict': {
                'id': '583011102',
                'ext': 'mp3',
                'title': 'Mezzo Valzer',
                'description': 'md5:4138d582f81866a530317bae316e8b61',
                'uploader': 'Micronie',
                'uploader_id': '3352531',
                'timestamp': 1551394171,
                'upload_date': '20190228',
                'duration': 180.157,
                'thumbnail': r're:https?://.*\.jpg',
                'license': 'all-rights-reserved',
                'view_count': int,
                'like_count': int,
                'comment_count': int,
                'repost_count': int,
            },
        },
        {
            # with AAC HQ format available via OAuth token
            'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1',
            'only_matching': True,
        },
    ]

    _API_V2_BASE = 'https://api-v2.soundcloud.com/'
    _BASE_URL = 'https://soundcloud.com/'
    # Matches the size-suffix of artwork URLs, e.g. "-large.jpg".
    _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'

    # Known artwork variants and their (square) pixel sizes; 0 = unknown.
    _ARTWORK_MAP = {
        'mini': 16,
        'tiny': 20,
        'small': 32,
        'badge': 47,
        't67x67': 67,
        'large': 100,
        't300x300': 300,
        'crop': 400,
        't500x500': 500,
        'original': 0,
    }

    def _store_client_id(self, client_id):
self._downloader.cache.store('soundcloud', 'client_id', client_id)

    def _update_client_id(self):
        # Scrape a fresh client_id from the scripts referenced by the
        # homepage (newest scripts first), then cache it.
        webpage = self._download_webpage('https://soundcloud.com/', None)
        for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)):
            script = self._download_webpage(src, None, fatal=False)
            if script:
                client_id = self._search_regex(
                    r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
                    script, 'client id', default=None)
                if client_id:
                    self._CLIENT_ID = client_id
                    self._store_client_id(client_id)
                    return
        raise ExtractorError('Unable to extract client id')

    def _download_json(self, *args, **kwargs):
        # Wrap the base _download_json to always send client_id and, on a
        # 401 (expired/invalid client id), refresh it once and retry.
        non_fatal = kwargs.get('fatal') is False
        if non_fatal:
            del kwargs['fatal']
        query = kwargs.get('query', {}).copy()
        for _ in range(2):
            query['client_id'] = self._CLIENT_ID
            kwargs['query'] = query
            try:
                return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                    self._store_client_id(None)
                    self._update_client_id()
                    continue
                elif non_fatal:
                    self._downloader.report_warning(error_to_compat_str(e))
                    return False
                raise

    def _real_initialize(self):
        # Use the cached client_id when available; the literal is a fallback.
        self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk'

    @classmethod
    def _resolv_url(cls, url):
        return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url

    def _extract_info_dict(self, info, full_title=None, secret_token=None):
        # info: track object from the api-v2 API.
        track_id = compat_str(info['id'])
        title = info['title']

        format_urls = set()  # used for de-duplicating stream URLs
        formats = []
        query = {'client_id': self._CLIENT_ID}
        if secret_token:
            query['secret_token'] = secret_token

        # Original-file download, when the uploader allows it.
        if info.get('downloadable') and info.get('has_downloads_left'):
            download_url = update_url_query(
                self._API_V2_BASE + 'tracks/' + track_id + '/download', query)
            redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri')
            if redirect_url:
                urlh = self._request_webpage(
                    HEADRequest(redirect_url), track_id, fatal=False)
                if urlh:
                    format_url = urlh.geturl()
format_urls.add(format_url)
                    formats.append({
                        'format_id': 'download',
                        'ext': urlhandle_detect_ext(urlh) or 'mp3',
                        'filesize': int_or_none(urlh.headers.get('Content-Length')),
                        'url': format_url,
                        'preference': 10,
                    })

        def invalid_url(url):
            # Reject empty URLs and duplicates already collected.
            return not url or url in format_urls

        def add_format(f, protocol, is_preview=False):
            # Derive abr/ext from the stream URL when missing, build the
            # format_id, mark previews, and normalize the protocol.
            mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url)
            if mobj:
                for k, v in mobj.groupdict().items():
                    if not f.get(k):
                        f[k] = v
            format_id_list = []
            if protocol:
                format_id_list.append(protocol)
            ext = f.get('ext')
            if ext == 'aac':
                f['abr'] = '256'
            for k in ('ext', 'abr'):
                v = f.get(k)
                if v:
                    format_id_list.append(v)
            preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url'])
            if preview:
                format_id_list.append('preview')
            abr = f.get('abr')
            if abr:
                f['abr'] = int(abr)
            if protocol == 'hls':
                protocol = 'm3u8' if ext == 'aac' else 'm3u8_native'
            else:
                protocol = 'http'
            f.update({
                'format_id': '_'.join(format_id_list),
                'protocol': protocol,
                'preference': -10 if preview else None,
            })
            formats.append(f)

        # New API
        transcodings = try_get(
            info, lambda x: x['media']['transcodings'], list) or []
        for t in transcodings:
            if not isinstance(t, dict):
                continue
            format_url = url_or_none(t.get('url'))
            if not format_url:
                continue
            # Each transcoding URL resolves to the actual stream URL.
            stream = self._download_json(
                format_url, track_id, query=query, fatal=False)
            if not isinstance(stream, dict):
                continue
            stream_url = url_or_none(stream.get('url'))
            if invalid_url(stream_url):
                continue
            format_urls.add(stream_url)
            stream_format = t.get('format') or {}
            protocol = stream_format.get('protocol')
            if protocol != 'hls' and '/hls' in format_url:
                protocol = 'hls'
            ext = None
            preset = str_or_none(t.get('preset'))
            if preset:
                ext = preset.split('_')[0]
            if ext not in KNOWN_EXTENSIONS:
                ext = mimetype2ext(stream_format.get('mime_type'))
            add_format({
                'url': stream_url,
                'ext': ext,
            }, 'http' if protocol == 'progressive' else protocol,
                t.get('snipped') or '/preview/' in format_url)

        # Audio-only service: mark all formats as having no video codec.
        for f in formats:
            f['vcodec'] = 
'none'

        if not formats and info.get('policy') == 'BLOCK':
            self.raise_geo_restricted()
        self._sort_formats(formats)

        user = info.get('user') or {}

        thumbnails = []
        artwork_url = info.get('artwork_url')
        # Fall back to the uploader's avatar when there is no album art.
        thumbnail = artwork_url or user.get('avatar_url')
        if isinstance(thumbnail, compat_str):
            if re.search(self._IMAGE_REPL_RE, thumbnail):
                # Generate every known artwork size from the one URL.
                for image_id, size in self._ARTWORK_MAP.items():
                    i = {
                        'id': image_id,
                        'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail),
                    }
                    if image_id == 'tiny' and not artwork_url:
                        size = 18
                    elif image_id == 'original':
                        i['preference'] = 10
                    if size:
                        i.update({
                            'width': size,
                            'height': size,
                        })
                    thumbnails.append(i)
            else:
                thumbnails = [{'url': thumbnail}]

        def extract_count(key):
            return int_or_none(info.get('%s_count' % key))

        return {
            'id': track_id,
            'uploader': user.get('username'),
            'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
            'uploader_url': user.get('permalink_url'),
            'timestamp': unified_timestamp(info.get('created_at')),
            'title': title,
            'description': info.get('description'),
            'thumbnails': thumbnails,
            'duration': float_or_none(info.get('duration'), 1000),
            'webpage_url': info.get('permalink_url'),
            'license': info.get('license'),
            'view_count': extract_count('playback'),
            'like_count': extract_count('favoritings') or extract_count('likes'),
            'comment_count': extract_count('comment'),
            'repost_count': extract_count('reposts'),
            'genre': info.get('genre'),
            'formats': formats
        }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        # Either an api(-v2) track URL with a numeric id, or a regular
        # soundcloud.com/<uploader>/<title>[/<token>] page that must be
        # resolved through the API first.
        track_id = mobj.group('track_id')

        query = {}
        if track_id:
            info_json_url = self._API_V2_BASE + 'tracks/' + track_id
            full_title = track_id
            token = mobj.group('secret_token')
            if token:
                query['secret_token'] = token
        else:
            full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title')
            token = mobj.group('token')
            if token:
                resolve_title += '/%s' % token
            info_json_url = self._resolv_url(self._BASE_URL + resolve_title)

        info = self._download_json(
            info_json_url, full_title, 
'Downloading info JSON', query=query)

        return self._extract_info_dict(info, full_title, token)


class SoundcloudPlaylistBaseIE(SoundcloudIE):
    def _extract_set(self, playlist, token=None):
        """Turn a playlist/set JSON object into a playlist result of track entries.

        If some tracks lack a permalink_url (stub entries) and a secret token
        is available, re-fetch the full track objects in one batched request.
        """
        playlist_id = compat_str(playlist['id'])
        tracks = playlist.get('tracks') or []
        if not all([t.get('permalink_url') for t in tracks]) and token:
            # Batch-resolve stub tracks via the v2 tracks endpoint
            tracks = self._download_json(
                self._API_V2_BASE + 'tracks', playlist_id,
                'Downloading tracks', query={
                    'ids': ','.join([compat_str(t['id']) for t in tracks]),
                    'playlistId': playlist_id,
                    'playlistSecretToken': token,
                })
        entries = []
        for track in tracks:
            track_id = str_or_none(track.get('id'))
            url = track.get('permalink_url')
            if not url:
                if not track_id:
                    # Neither a URL nor an id — nothing to extract from
                    continue
                # Fall back to the API URL for this track, propagating the token
                url = self._API_V2_BASE + 'tracks/' + track_id
                if token:
                    url += '?secret_token=' + token
            entries.append(self.url_result(
                url, SoundcloudIE.ie_key(), track_id))
        return self.playlist_result(
            entries, playlist_id,
            playlist.get('title'),
            playlist.get('description'))


class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
    IE_NAME = 'soundcloud:set'
    _TESTS = [{
        'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
        'info_dict': {
            'id': '2284613',
            'title': 'The Royal Concept EP',
            'description': 'md5:71d07087c7a449e8941a70a29e34671e',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        # Resolve "<uploader>/sets/<slug>[/<token>]" to the set's JSON object
        full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title')
        token = mobj.group('token')
        if token:
            full_title += '/' + token

        info = self._download_json(self._resolv_url(
            self._BASE_URL + full_title), full_title)

        if 'errors' in info:
            msgs = (compat_str(err['error_message']) for err in info['errors'])
            raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))

        return self._extract_set(info, token)


class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
    def _extract_playlist(self, base_url, playlist_id, playlist_title):
        """Walk SoundCloud's linked-partitioning pagination, collecting entries.

        Follows response['next_href'] until exhausted, resolving each
        collection item (or its nested track/playlist) to a url_result.
        """
        # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
        # https://developers.soundcloud.com/blog/offset-pagination-deprecated
        COMMON_QUERY = {
            'limit': 200,
            'linked_partitioning': '1',
        }

        query = COMMON_QUERY.copy()
        query['offset'] = 0

        next_href = base_url

        entries = []
        for i in itertools.count():
            response = self._download_json(
                next_href, playlist_id,
                'Downloading track page %s' % (i + 1), query=query)

            collection = response['collection']

            if not isinstance(collection, list):
                collection = []

            # Empty collection may be returned, in this case we proceed
            # straight to next_href

            def resolve_entry(candidates):
                # Return a url_result for the first candidate dict with a
                # valid permalink_url; None if no candidate qualifies
                for cand in candidates:
                    if not isinstance(cand, dict):
                        continue
                    permalink_url = url_or_none(cand.get('permalink_url'))
                    if not permalink_url:
                        continue
                    return self.url_result(
                        permalink_url,
                        SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
                        str_or_none(cand.get('id')), cand.get('title'))

            for e in collection:
                entry = resolve_entry((e, e.get('track'), e.get('playlist')))
                if entry:
                    entries.append(entry)

            next_href = response.get('next_href')
            if not next_href:
                break

            next_href = response['next_href']
            # Carry over the pagination cursor but re-apply the common query
            # (next_href may not repeat limit/linked_partitioning)
            parsed_next_href = compat_urlparse.urlparse(next_href)
            query = compat_urlparse.parse_qs(parsed_next_href.query)
            query.update(COMMON_QUERY)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist_title,
            'entries': entries,
        }


class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
    _VALID_URL = r'''(?x)
                        https?://
                            (?:(?:www|m)\.)?soundcloud\.com/
                            (?P<user>[^/]+)
                            (?:/
                                (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight)
                            )?
/?(?:[?#].*)?$
                    '''
    IE_NAME = 'soundcloud:user'
    _TESTS = [{
        'url': 'https://soundcloud.com/soft-cell-official',
        'info_dict': {
            'id': '207965082',
            'title': 'Soft Cell (All)',
        },
        'playlist_mincount': 28,
    }, {
        'url': 'https://soundcloud.com/soft-cell-official/tracks',
        'info_dict': {
            'id': '207965082',
            'title': 'Soft Cell (Tracks)',
        },
        'playlist_mincount': 27,
    }, {
        'url': 'https://soundcloud.com/soft-cell-official/albums',
        'info_dict': {
            'id': '207965082',
            'title': 'Soft Cell (Albums)',
        },
        'playlist_mincount': 1,
    }, {
        'url': 'https://soundcloud.com/jcv246/sets',
        'info_dict': {
            'id': '12982173',
            'title': 'Jordi / cv (Sets)',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://soundcloud.com/jcv246/reposts',
        'info_dict': {
            'id': '12982173',
            'title': 'Jordi / cv (Reposts)',
        },
        'playlist_mincount': 6,
    }, {
        'url': 'https://soundcloud.com/clalberg/likes',
        'info_dict': {
            'id': '11817582',
            'title': 'clalberg (Likes)',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://soundcloud.com/grynpyret/spotlight',
        'info_dict': {
            'id': '7098329',
            'title': 'Grynpyret (Spotlight)',
        },
        'playlist_mincount': 1,
    }]

    # Maps the URL resource segment to the API v2 endpoint template
    # (filled with the numeric user id)
    _BASE_URL_MAP = {
        'all': 'stream/users/%s',
        'tracks': 'users/%s/tracks',
        'albums': 'users/%s/albums',
        'sets': 'users/%s/playlists',
        'reposts': 'stream/users/%s/reposts',
        'likes': 'users/%s/likes',
        'spotlight': 'users/%s/spotlight',
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group('user')

        # Resolve the vanity URL to the user object (for the numeric id)
        user = self._download_json(
            self._resolv_url(self._BASE_URL + uploader),
            uploader, 'Downloading user info')

        resource = mobj.group('rsrc') or 'all'

        return self._extract_playlist(
            self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'],
            str_or_none(user.get('id')),
            '%s (%s)' % (user['username'], resource.capitalize()))


class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
    IE_NAME = 'soundcloud:trackstation'
    _TESTS = [{
        'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
        'info_dict': {
            'id': '286017854',
            'title': 'Track station: your text',
        },
        'playlist_mincount': 47,
    }]

    def _real_extract(self, url):
        track_name = self._match_id(url)

        track = self._download_json(self._resolv_url(url), track_name)
        # track['id'] looks like 'soundcloud:track-stations:<num>'; keep <num>
        track_id = self._search_regex(
            r'soundcloud:track-stations:(\d+)', track['id'], 'track id')

        return self._extract_playlist(
            self._API_V2_BASE + 'stations/%s/tracks' % track['id'],
            track_id, 'Track station: %s' % track['title'])


class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
    _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
    IE_NAME = 'soundcloud:playlist'
    _TESTS = [{
        'url': 'https://api.soundcloud.com/playlists/4110309',
        'info_dict': {
            'id': '4110309',
            'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
            'description': 're:.*?TILT Brass - Bowery Poetry Club',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')

        query = {}
        token = mobj.group('token')
        if token:
            query['secret_token'] = token

        data = self._download_json(
            self._API_V2_BASE + 'playlists/' + playlist_id,
            playlist_id, 'Downloading playlist', query=query)

        return self._extract_set(data, token)


class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
    IE_NAME = 'soundcloud:search'
    IE_DESC = 'Soundcloud search'
    _MAX_RESULTS = float('inf')
    _TESTS = [{
        'url': 'scsearch15:post-avant jazzcore',
        'info_dict': {
            'title': 'post-avant jazzcore',
        },
        'playlist_count': 15,
    }]

    _SEARCH_KEY = 'scsearch'
    _MAX_RESULTS_PER_PAGE = 200
    _DEFAULT_RESULTS_PER_PAGE = 50

    def _get_collection(self, endpoint, collection_id, **query):
        """Generator over url_results from a paginated v2 collection endpoint,
        capped at query['limit'] results (default 50, max 200 per page)."""
        limit = min(
            query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
            self._MAX_RESULTS_PER_PAGE)
        query.update({
            'limit': limit,
            'linked_partitioning': 1,
            'offset': 0,
        })
        next_url = update_url_query(self._API_V2_BASE + endpoint, query)

        collected_results =
0

        for i in itertools.count(1):
            response = self._download_json(
                next_url, collection_id, 'Downloading page {0}'.format(i),
                'Unable to download API page')

            collection = response.get('collection', [])
            if not collection:
                break

            # Drop falsy placeholders before counting/yielding
            collection = list(filter(bool, collection))
            collected_results += len(collection)

            for item in collection:
                yield self.url_result(item['uri'], SoundcloudIE.ie_key())

            if not collection or collected_results >= limit:
                break

            next_url = response.get('next_href')
            if not next_url:
                break

    def _get_n_results(self, query, n):
        # SearchInfoExtractor hook: first n track results for the query
        tracks = self._get_collection('search/tracks', query, limit=n, q=query)
        return self.playlist_result(tracks, playlist_title=query)


================================================ FILE: youtube_dl/extractor/soundgasm.py ================================================
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor


class SoundgasmIE(InfoExtractor):
    IE_NAME = 'soundgasm'
    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)'
    _TEST = {
        'url': 'http://soundgasm.net/u/ytdl/Piano-sample',
        'md5': '010082a2c802c5275bb00030743e75ad',
        'info_dict': {
            'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9',
            'ext': 'm4a',
            'title': 'Piano sample',
            'description': 'Royalty Free Sample Music',
            'uploader': 'ytdl',
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        webpage = self._download_webpage(url, display_id)

        # The m4a URL is embedded in the page's player configuration
        audio_url = self._html_search_regex(
            r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
            'audio URL', group='url')

        title = self._search_regex(
            r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)',
            webpage, 'title', default=display_id)

        description = self._html_search_regex(
            (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>',
             r'(?s)<li>Description:\s(.*?)<\/li>'),
            webpage, 'description', fatal=False)

        # Prefer the hash-like basename of the audio file as the id,
        # falling back to the display id
        audio_id = self._search_regex(
            r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id)

        return {
            'id': audio_id,
            'display_id': display_id,
            'url': audio_url,
            'vcodec': 'none',
            'title': title,
            'description': description,
            'uploader': mobj.group('user'),
        }


class SoundgasmProfileIE(InfoExtractor):
    IE_NAME = 'soundgasm:profile'
    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$'
    _TEST = {
        'url': 'http://soundgasm.net/u/ytdl',
        'info_dict': {
            'id': 'ytdl',
        },
        'playlist_count': 1,
    }

    def _real_extract(self, url):
        profile_id = self._match_id(url)

        webpage = self._download_webpage(url, profile_id)

        # Every link back into this user's namespace is an audio page
        entries = [
            self.url_result(audio_url, 'Soundgasm')
            for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)]

        return self.playlist_result(entries, profile_id)


================================================ FILE: youtube_dl/extractor/southpark.py ================================================
# coding: utf-8
from __future__ import unicode_literals

from .mtv import MTVServicesInfoExtractor


class SouthParkIE(MTVServicesInfoExtractor):
    IE_NAME = 'southpark.cc.com'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'

    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'

    _TESTS = [{
        'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured',
        'info_dict': {
            'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30',
            'ext': 'mp4',
            'title': 'South Park|Bat Daded',
            'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.',
            'timestamp': 1112760000,
            'upload_date': '20050406',
        },
    }, {
        'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1',
        'only_matching': True,
    }, {
        'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1',
        'only_matching': True,
    }]

    def _get_feed_query(self, uri):
        # MTVServicesInfoExtractor hook: query parameters for the MRSS feed
        return {
            'accountOverride': 'intl.mtvi.com',
            'arcEp': 'shared.southpark.global',
            'ep': '90877963',
            'imageEp': 'shared.southpark.global',
            'mgid': uri,
        }
class SouthParkEsIE(SouthParkIE):
    IE_NAME = 'southpark.cc.com:español'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
    # Spanish-language feed variant
    _LANG = 'es'

    _TESTS = [{
        'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
        'info_dict': {
            'title': 'Cartman Consigue Una Sonda Anal',
            'description': 'Cartman Consigue Una Sonda Anal',
        },
        'playlist_count': 4,
        'skip': 'Geo-restricted',
    }]


class SouthParkDeIE(SouthParkIE):
    IE_NAME = 'southpark.de'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden|collections)/(?P<id>.+?)(\?|#|$))'
    _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'

    _TESTS = [{
        'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
        'info_dict': {
            'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2',
            'ext': 'mp4',
            'title': 'South Park|The Government Won\'t Respect My Privacy',
            'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
            'timestamp': 1380160800,
            'upload_date': '20130926',
        },
    }, {
        # non-ASCII characters in initial URL
        'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
        'info_dict': {
            'title': 'Hashtag „Aufwärmen“',
            'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.',
        },
        'playlist_count': 3,
    }, {
        # non-ASCII characters in redirect URL
        'url': 'http://www.southpark.de/alle-episoden/s18e09',
        'info_dict': {
            'title': 'Hashtag „Aufwärmen“',
            'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.',
        },
        'playlist_count': 3,
    }, {
        'url': 'http://www.southpark.de/collections/2476/superhero-showdown/1',
        'only_matching': True,
    }]


class SouthParkNlIE(SouthParkIE):
    IE_NAME = 'southpark.nl'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
    _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/'

    _TESTS = [{
        'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
        'info_dict': {
            'title': 'Freemium Isn\'t Free',
            'description': 'Stan is addicted to the new Terrance and Phillip mobile game.',
        },
        'playlist_mincount': 3,
    }]


class SouthParkDkIE(SouthParkIE):
    IE_NAME = 'southparkstudios.dk'
    _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))'
    _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/'

    _TESTS = [{
        'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
        'info_dict': {
            'title': 'Grounded Vindaloop',
            'description': 'Butters is convinced he\'s living in a virtual reality.',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1',
        'only_matching': True,
    }, {
        'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1',
        'only_matching': True,
    }]


================================================ FILE: youtube_dl/extractor/spankbang.py ================================================
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    ExtractorError,
    merge_dicts,
    parse_duration,
    parse_resolution,
    str_to_int,
    url_or_none,
    urlencode_postdata,
    urljoin,
)


class SpankBangIE(InfoExtractor):
    _VALID_URL = r'''(?x)
                    https?://
                        (?:[^/]+\.)?spankbang\.com/
                        (?:
                            (?P<id>[\da-z]+)/(?:video|play|embed)\b|
[\da-z]+-(?P<id_2>[\da-z]+)/playlist/[^/?#&]+ ) ''' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', 'info_dict': { 'id': '3vvn', 'ext': 'mp4', 'title': 'fantasy solo', 'description': 'dillion harper masturbates on a bed', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'silly2587', 'timestamp': 1422571989, 'upload_date': '20150129', 'age_limit': 18, } }, { # 480p only 'url': 'http://spankbang.com/1vt0/video/solvane+gangbang', 'only_matching': True, }, { # no uploader 'url': 'http://spankbang.com/lklg/video/sex+with+anyone+wedding+edition+2', 'only_matching': True, }, { # mobile page 'url': 'http://m.spankbang.com/1o2de/video/can+t+remember+her+name', 'only_matching': True, }, { # 4k 'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k', 'only_matching': True, }, { 'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/', 'only_matching': True, }, { 'url': 'https://m.spankbang.com/3vvn/play', 'only_matching': True, }, { 'url': 'https://spankbang.com/2y3td/embed/', 'only_matching': True, }, { 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') or mobj.group('id_2') webpage = self._download_webpage( url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage): raise ExtractorError( 'Video %s is not available' % video_id, expected=True) formats = [] def extract_format(format_id, format_url): f_url = url_or_none(format_url) if not f_url: return f = parse_resolution(format_id) ext = determine_ext(f_url) if format_id.startswith('m3u8') or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( f_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) elif format_id.startswith('mpd') or ext == 'mpd': 
formats.extend(self._extract_mpd_formats( f_url, video_id, mpd_id='dash', fatal=False)) elif ext == 'mp4' or f.get('width') or f.get('height'): f.update({ 'url': f_url, 'format_id': format_id, }) formats.append(f) STREAM_URL_PREFIX = 'stream_url_' for mobj in re.finditer( r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2' % STREAM_URL_PREFIX, webpage): extract_format(mobj.group('id', 'url')) if not formats: stream_key = self._search_regex( r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'stream key', group='value') stream = self._download_json( 'https://spankbang.com/api/videos/stream', video_id, 'Downloading stream JSON', data=urlencode_postdata({ 'id': stream_key, 'data': 0, }), headers={ 'Referer': url, 'X-Requested-With': 'XMLHttpRequest', }) for format_id, format_url in stream.items(): if format_url and isinstance(format_url, list): format_url = format_url[0] extract_format(format_id, format_url) self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) info = self._search_json_ld(webpage, video_id, default={}) title = self._html_search_regex( r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title', default=None) description = self._search_regex( r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)', webpage, 'description', default=None) thumbnail = self._og_search_thumbnail(webpage, default=None) uploader = self._html_search_regex( (r'(?s)<li[^>]+class=["\']profile[^>]+>(.+?)</a>', r'class="user"[^>]*><img[^>]+>([^<]+)'), webpage, 'uploader', default=None) duration = parse_duration(self._search_regex( r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)', webpage, 'duration', default=None)) view_count = str_to_int(self._search_regex( r'([\d,.]+)\s+plays', webpage, 'view count', default=None)) age_limit = self._rta_search(webpage) return merge_dicts({ 'id': video_id, 'title': title or video_id, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, 'duration': 
duration, 'view_count': view_count, 'formats': formats, 'age_limit': age_limit, }, info ) class SpankBangPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/(?P<display_id>[^/]+)' _TEST = { 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', 'info_dict': { 'id': 'ug0k', 'title': 'Big Ass Titties', }, 'playlist_mincount': 40, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) entries = [self.url_result( urljoin(url, mobj.group('path')), ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) for mobj in re.finditer( r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' % re.escape(display_id), webpage)] title = self._html_search_regex( r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) ================================================ FILE: youtube_dl/extractor/spankwire.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, merge_dicts, str_or_none, str_to_int, url_or_none, ) class SpankwireIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?:www\.)?spankwire\.com/ (?: [^/]+/video| EmbedPlayer\.aspx/?\?.*?\bArticleId= ) (?P<id>\d+) ''' _TESTS = [{ # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', 'info_dict': { 'id': '103545', 'ext': 'mp4', 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', 'description': 'Crazy Bitch X rated music video.', 'duration': 222, 'uploader': 'oreusz', 'uploader_id': '124697', 'timestamp': 1178587885, 'upload_date': 
'20070508', 'average_rating': float, 'view_count': int, 'comment_count': int, 'age_limit': 18, 'categories': list, 'tags': list, }, }, { # download URL pattern: */mp4_<format_id>_<video_id>.mp4 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', 'md5': '09b3c20833308b736ae8902db2f8d7e6', 'info_dict': { 'id': '1921551', 'ext': 'mp4', 'title': 'Titcums Compiloation I', 'description': 'cum on tits', 'uploader': 'dannyh78999', 'uploader_id': '3056053', 'upload_date': '20150822', 'age_limit': 18, }, 'params': { 'proxy': '127.0.0.1:8118' }, 'skip': 'removed', }, { 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', webpage) def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) title = video['title'] formats = [] videos = video.get('videos') if isinstance(videos, dict): for format_id, format_url in videos.items(): video_url = url_or_none(format_url) if not format_url: continue height = int_or_none(self._search_regex( r'(\d+)[pP]', format_id, 'height', default=None)) m = re.search( r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url) if m: tbr = int(m.group('tbr')) height = height or int(m.group('height')) else: tbr = None formats.append({ 'url': video_url, 'format_id': '%dp' % height if height else format_id, 'height': height, 'tbr': tbr, }) m3u8_url = url_or_none(video.get('HLS')) if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) view_count = str_to_int(video.get('viewed')) thumbnails = [] for preference, t in enumerate(('', '2x'), start=0): thumbnail_url = 
url_or_none(video.get('poster%s' % t)) if not thumbnail_url: continue thumbnails.append({ 'url': thumbnail_url, 'preference': preference, }) def extract_names(key): entries_list = video.get(key) if not isinstance(entries_list, list): return entries = [] for entry in entries_list: name = str_or_none(entry.get('name')) if name: entries.append(name) return entries categories = extract_names('categories') tags = extract_names('tags') uploader = None info = {} webpage = self._download_webpage( 'https://www.spankwire.com/_/video%s/' % video_id, video_id, fatal=False) if webpage: info = self._search_json_ld(webpage, video_id, default={}) thumbnail_url = None if 'thumbnail' in info: thumbnail_url = url_or_none(info['thumbnail']) del info['thumbnail'] if not thumbnail_url: thumbnail_url = self._og_search_thumbnail(webpage) if thumbnail_url: thumbnails.append({ 'url': thumbnail_url, 'preference': 10, }) uploader = self._html_search_regex( r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>', webpage, 'uploader', fatal=False) if not view_count: view_count = str_to_int(self._search_regex( r'data-views=["\']([\d,.]+)', webpage, 'view count', fatal=False)) return merge_dicts({ 'id': video_id, 'title': title, 'description': video.get('description'), 'duration': int_or_none(video.get('duration')), 'thumbnails': thumbnails, 'uploader': uploader, 'uploader_id': str_or_none(video.get('userId')), 'timestamp': int_or_none(video.get('time_approved_on')), 'average_rating': float_or_none(video.get('rating')), 'view_count': view_count, 'comment_count': int_or_none(video.get('comments')), 'age_limit': 18, 'categories': categories, 'tags': tags, 'formats': formats, }, info) ================================================ FILE: youtube_dl/extractor/spiegel.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): _UUID_RE = 
r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { 'id': 'II0BUyxY', 'display_id': '1259285', 'ext': 'mp4', 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 48.0, 'upload_date': '20130311', 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', 'only_matching': True, }, { 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', 'only_matching': True, }, { 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', 'only_matching': True, }, { 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, }, { 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) media_id = self._html_search_regex( r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', webpage, 'media id', group='id') return { '_type': 'url_transparent', 'id': video_id, 'display_id': video_id, 'url': 'jwplatform:%s' % media_id, 'title': self._og_search_title(webpage, default=None), 'ie_key': JWPlatformIE.ie_key(), } ================================================ FILE: youtube_dl/extractor/spike.py ================================================ from __future__ import unicode_literals from 
.mtv import MTVServicesInfoExtractor class BellatorIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?bellator\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', 'info_dict': { 'title': 'Michael Page vs. Evangelista Cyborg', 'description': 'md5:0d917fc00ffd72dd92814963fc6cbb05', }, 'playlist_count': 3, }, { 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', 'only_matching': True, }] _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13', 'info_dict': { 'id': '37ace3a8-1df6-48be-85b8-38df8229e241', 'ext': 'mp4', 'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. 
Jim Rash|Act 1', 'description': 'md5:a739ca8f978a7802f67f8016d27ce114', }, 'params': { # m3u8 download 'skip_download': True, }, }] _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _GEO_COUNTRIES = ['US'] def _get_feed_query(self, uri): return { 'arcEp': 'paramountnetwork.com', 'imageEp': 'paramountnetwork.com', 'mgid': uri, } ================================================ FILE: youtube_dl/extractor/sport5.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ExtractorError class Sport5IE(InfoExtractor): _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' _TESTS = [ { 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', 'info_dict': { 'id': 's5-Y59xx1-GUh2', 'ext': 'mp4', 'title': 'ולנסיה-קורדובה 0:3', 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', 'duration': 228, 'categories': list, }, 'skip': 'Blocked outside of Israel', }, { 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', 'info_dict': { 'id': 's5-SiXxx1-hKh2', 'ext': 'mp4', 'title': 'GOALS_CELTIC_270914.mp4', 'description': '', 'duration': 87, 'categories': list, }, 'skip': 'Blocked outside of Israel', } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) media_id = mobj.group('id') webpage = self._download_webpage(url, media_id) video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id') metadata = self._download_xml( 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, video_id) error = metadata.find('./Error') if error is not None: raise ExtractorError( '%s returned error: %s - %s' % ( self.IE_NAME, error.find('./Name').text, error.find('./Description').text), expected=True) title = metadata.find('./Title').text description = metadata.find('./Description').text duration = 
int(metadata.find('./Duration').text) posters_el = metadata.find('./PosterLinks') thumbnails = [{ 'url': thumbnail.text, 'width': int(thumbnail.get('width')), 'height': int(thumbnail.get('height')), } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] categories_el = metadata.find('./Categories') categories = [ cat.get('name') for cat in categories_el.findall('./Category') ] if categories_el is not None else [] formats = [{ 'url': fmt.text, 'ext': 'mp4', 'vbr': int(fmt.get('bitrate')), 'width': int(fmt.get('width')), 'height': int(fmt.get('height')), } for fmt in metadata.findall('./PlaybackLinks/FileURL')] self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, 'thumbnails': thumbnails, 'duration': duration, 'categories': categories, 'formats': formats, } ================================================ FILE: youtube_dl/extractor/sportbox.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, js_to_json, merge_dicts, ) class SportBoxIE(InfoExtractor): _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'info_dict': { 'id': '109158', 'ext': 'mp4', 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 292, 'view_count': int, 'timestamp': 1426237001, 'upload_date': '20150313', }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', 'only_matching': True, }, { 'url': 'https://news.sportbox.ru/vdl/player/media/193095', 'only_matching': True, }, { 'url': 
class SportDeutschlandIE(InfoExtractor):
    # Matches both plain permalinks and "<section>/<slug>" style paths.
    _VALID_URL = r'https?://sportdeutschland\.tv/(?P<id>(?:[^/]+/)?[^?#/&]+)'
    _TESTS = [{
        'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
        'info_dict': {
            'id': '5318cac0275701382770543d7edaf0a0',
            'ext': 'mp4',
            'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1',
            'duration': 16106.36,
        },
        'params': {
            'noplaylist': True,
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
        'info_dict': {
            'id': 'c6e2fdd01f63013854c47054d2ab776f',
            'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals',
            'description': 'md5:5263ff4c31c04bb780c9f91130b48530',
            'duration': 31397,
        },
        'playlist_count': 2,
    }, {
        'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a sportdeutschland.tv permalink to either a single
        video or a multi_video playlist, depending on how many videos the
        asset carries and on ?playlistId= / --no-playlist."""
        display_id = self._match_id(url)
        # The backend API resolves a permalink to the full asset record.
        data = self._download_json(
            'https://backend.sportdeutschland.tv/api/permalinks/' + display_id,
            display_id, query={'access_token': 'true'})
        asset = data['asset']
        title = (asset.get('title') or asset['label']).strip()
        asset_id = asset.get('id') or asset.get('uuid')
        info = {
            'id': asset_id,
            'title': title,
            'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'),
            'duration': int_or_none(asset.get('seconds')),
        }
        videos = asset.get('videos') or []
        if len(videos) > 1:
            # Multi-part asset: ?playlistId=N combined with --no-playlist
            # selects a single part (playlistId is an index into videos).
            playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0]
            if playlist_id:
                if self._downloader.params.get('noplaylist'):
                    videos = [videos[int(playlist_id)]]
                    self.to_screen('Downloading just a single video because of --no-playlist')
                else:
                    self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id)

            def entries():
                # Lazily yield one entry per part; parts missing an id or
                # URL, or whose m3u8 yields no formats, are skipped.
                for i, video in enumerate(videos, 1):
                    video_id = video.get('uuid')
                    video_url = video.get('url')
                    if not (video_id and video_url):
                        continue
                    formats = self._extract_m3u8_formats(
                        video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False)
                    if not formats:
                        continue
                    yield {
                        'id': video_id,
                        'formats': formats,
                        'title': title + ' - ' + (video.get('label') or 'Teil %d' % i),
                        'duration': float_or_none(video.get('duration')),
                    }
            info.update({
                '_type': 'multi_video',
                'entries': entries(),
            })
        else:
            # Single-video asset: extract formats directly (fatal here,
            # unlike the playlist branch) and add asset-level metadata.
            formats = self._extract_m3u8_formats(
                videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4')
            section_title = strip_or_none(try_get(data, lambda x: x['section']['title']))
            info.update({
                'formats': formats,
                'display_id': asset.get('permalink'),
                'thumbnail': try_get(asset, lambda x: x['images'][0]),
                'categories': [section_title] if section_title else None,
                'view_count': int_or_none(asset.get('views')),
                'is_live': asset.get('is_live') is True,
                'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')),
            })
        return info
json.dumps(variables), 'extensions': json.dumps({ 'persistedQuery': { 'sha256Hash': self._OPERATION_HASHES[operation], }, }) }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data'] def _extract_episode(self, episode, series): episode_id = episode['id'] title = episode['name'].strip() formats = [] audio_preview = episode.get('audioPreview') or {} audio_preview_url = audio_preview.get('url') if audio_preview_url: f = { 'url': audio_preview_url.replace('://p.scdn.co/mp3-preview/', '://anon-podcast.scdn.co/'), 'vcodec': 'none', } audio_preview_format = audio_preview.get('format') if audio_preview_format: f['format_id'] = audio_preview_format mobj = re.match(r'([0-9A-Z]{3})_(?:[A-Z]+_)?(\d+)', audio_preview_format) if mobj: f.update({ 'abr': int(mobj.group(2)), 'ext': mobj.group(1).lower(), }) formats.append(f) for item in (try_get(episode, lambda x: x['audio']['items']) or []): item_url = item.get('url') if not (item_url and item.get('externallyHosted')): continue formats.append({ 'url': clean_podcast_url(item_url), 'vcodec': 'none', }) thumbnails = [] for source in (try_get(episode, lambda x: x['coverArt']['sources']) or []): source_url = source.get('url') if not source_url: continue thumbnails.append({ 'url': source_url, 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), }) return { 'id': episode_id, 'title': title, 'formats': formats, 'thumbnails': thumbnails, 'description': strip_or_none(episode.get('description')), 'duration': float_or_none(try_get( episode, lambda x: x['duration']['totalMilliseconds']), 1000), 'release_date': unified_strdate(try_get( episode, lambda x: x['releaseDate']['isoString'])), 'series': series, } class SpotifyIE(SpotifyBaseIE): IE_NAME = 'spotify' _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode' _TEST = { 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo', 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b', 'info_dict': { 'id': '4Z7GAJ50bgctf6uclHlWKo', 'ext': 'mp3', 'title': 
'From the archive: Why time management is ruining our lives', 'description': 'md5:b120d9c4ff4135b42aa9b6d9cde86935', 'duration': 2083.605, 'release_date': '20201217', 'series': "The Guardian's Audio Long Reads", } } def _real_extract(self, url): episode_id = self._match_id(url) episode = self._call_api('Episode', episode_id, { 'uri': 'spotify:episode:' + episode_id })['episode'] return self._extract_episode( episode, try_get(episode, lambda x: x['podcast']['name'])) class SpotifyShowIE(SpotifyBaseIE): IE_NAME = 'spotify:show' _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'show' _TEST = { 'url': 'https://open.spotify.com/show/4PM9Ke6l66IRNpottHKV9M', 'info_dict': { 'id': '4PM9Ke6l66IRNpottHKV9M', 'title': 'The Story from the Guardian', 'description': 'The Story podcast is dedicated to our finest audio documentaries, investigations and long form stories', }, 'playlist_mincount': 36, } def _real_extract(self, url): show_id = self._match_id(url) podcast = self._call_api('ShowEpisodes', show_id, { 'limit': 1000000000, 'offset': 0, 'uri': 'spotify:show:' + show_id, })['podcast'] podcast_name = podcast.get('name') entries = [] for item in (try_get(podcast, lambda x: x['episodes']['items']) or []): episode = item.get('episode') if not episode: continue entries.append(self._extract_episode(episode, podcast_name)) return self.playlist_result( entries, show_id, podcast_name, podcast.get('description')) ================================================ FILE: youtube_dl/extractor/spreaker.py ================================================ # coding: utf-8 from __future__ import unicode_literals import itertools from .common import InfoExtractor from ..compat import compat_str from ..utils import ( float_or_none, int_or_none, str_or_none, try_get, unified_timestamp, url_or_none, ) def _extract_episode(data, episode_id=None): title = data['title'] download_url = data['download_url'] series = try_get(data, lambda x: x['show']['title'], compat_str) uploader = try_get(data, lambda x: 
class SpreakerIE(InfoExtractor):
    """Extractor for episode URLs on the Spreaker API host.

    Handles /episode/<id>, /download/episode/<id>/... and
    /v2/episodes/<id> URL shapes; all metadata comes from the v2 API.
    """
    _VALID_URL = r'''(?x)
                    https?://
                        api\.spreaker\.com/
                        (?:
                            (?:download/)?episode|
                            v2/episodes
                        )/
                        (?P<id>\d+)
                    '''
    _TESTS = [{
        'url': 'https://api.spreaker.com/episode/12534508',
        'info_dict': {
            'id': '12534508',
            'display_id': 'swm-ep15-how-to-market-your-music-part-2',
            'ext': 'mp3',
            'title': 'EP:15 | Music Marketing (Likes) - Part 2',
            'description': 'md5:0588c43e27be46423e183076fa071177',
            'timestamp': 1502250336,
            'upload_date': '20170809',
            'uploader': 'SWM',
            'uploader_id': '9780658',
            'duration': 1063.42,
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'series': 'Success With Music (SWM)',
        },
    }, {
        'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3',
        'only_matching': True,
    }, {
        'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        episode_id = self._match_id(url)
        api_response = self._download_json(
            'https://api.spreaker.com/v2/episodes/%s' % episode_id,
            episode_id)
        episode_data = api_response['response']['episode']
        return _extract_episode(episode_data, episode_id)
class SpringboardPlatformIE(InfoExtractor):
    # The two alternations match the player/preview pages and the raw
    # RSS3 XML feed URLs; the customer index and video id are captured
    # under different group names depending on which branch matched.
    _VALID_URL = r'''(?x)
                    https?://
                        cms\.springboardplatform\.com/
                        (?:
                            (?:previews|embed_iframe)/(?P<index>\d+)/video/(?P<id>\d+)|
                            xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+)
                        )
                '''
    _TESTS = [{
        'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1',
        'md5': '5c3cb7b5c55740d482561099e920f192',
        'info_dict': {
            'id': '981017',
            'ext': 'mp4',
            'title': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX',
            'description': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX',
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1409132328,
            'upload_date': '20140827',
            'duration': 193,
        },
    }, {
        'url': 'http://cms.springboardplatform.com/embed_iframe/159/video/981017/rab007/rapbasement.com/1/1',
        'only_matching': True,
    }, {
        'url': 'http://cms.springboardplatform.com/embed_iframe/20/video/1731611/ki055/kidzworld.com/10',
        'only_matching': True,
    }, {
        'url': 'http://cms.springboardplatform.com/xml_feeds_advanced/index/159/rss3/981017/0/0/1/',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        # Find Springboard iframes embedded in third-party pages (used by
        # the generic extractor).
        return [
            mobj.group('url')
            for mobj in re.finditer(
                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1',
                webpage)]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Pick whichever group set matched (page URL vs. feed URL).
        video_id = mobj.group('id') or mobj.group('id_2')
        index = mobj.group('index') or mobj.group('index_2')

        # All metadata is taken from the single-video RSS3 XML feed.
        video = self._download_xml(
            'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s'
            % (index, video_id), video_id)

        item = xpath_element(video, './/item', 'item', fatal=True)
        content = xpath_element(
            item, './{http://search.yahoo.com/mrss/}content', 'content',
            fatal=True)
        title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True))

        video_url = content.attrib['url']

        # The CMS serves a placeholder clip for removed videos.
        if 'error_video.mp4' in video_url:
            raise ExtractorError(
                'Video %s no longer exists' % video_id, expected=True)

        duration = int_or_none(content.get('duration'))
        tbr = int_or_none(content.get('bitrate'))
        filesize = int_or_none(content.get('fileSize'))
        width = int_or_none(content.get('width'))
        height = int_or_none(content.get('height'))

        description = unescapeHTML(xpath_text(
            item, './description', 'description'))
        thumbnail = xpath_attr(
            item, './{http://search.yahoo.com/mrss/}thumbnail', 'url',
            'thumbnail')

        timestamp = unified_timestamp(xpath_text(
            item, './{http://cms.springboardplatform.com/namespaces.html}created',
            'timestamp'))

        formats = [{
            'url': video_url,
            'format_id': 'http',
            'tbr': tbr,
            'filesize': filesize,
            'width': width,
            'height': height,
        }]

        # An HLS rendition lives at the same path behind the "hls." host.
        m3u8_format = formats[0].copy()
        m3u8_format.update({
            'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8',
            'ext': 'mp4',
            'format_id': 'hls',
            'protocol': 'm3u8_native',
        })
        formats.append(m3u8_format)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
        }
class SRGSSRIE(InfoExtractor):
    # Accepts both tp.srgssr.ch player URLs carrying an urn= parameter
    # and internal "srgssr:<bu>:<type>:<id>" pseudo-URLs produced by
    # SRGSSRPlayIE below.
    _VALID_URL = r'''(?x)
                    (?:
                        https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|
                        srgssr
                    ):
                    (?P<bu>
                        srf|rts|rsi|rtr|swi
                    ):(?:[^:]+:)?
                    (?P<type>
                        video|audio
                    ):
                    (?P<id>
                        [0-9a-f\-]{36}|\d+
                    )
                    '''
    _GEO_BYPASS = False
    _GEO_COUNTRIES = ['CH']
    # Human-readable messages for the blockReason codes returned by the
    # integration layer.
    _ERRORS = {
        'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.',
        'AGERATING18': 'To protect children under the age of 18, this video is only available between 11 p.m. and 5 a.m.',
        # 'ENDDATE': 'For legal reasons, this video was only available for a specified period of time.',
        'GEOBLOCK': 'For legal reasons, this video is only available in Switzerland.',
        'LEGAL': 'The video cannot be transmitted for legal reasons.',
        'STARTDATE': 'This video is not yet available. Please try again later.',
    }
    # Fallback subtitle language per business unit when the subtitle
    # entry has no locale of its own.
    _DEFAULT_LANGUAGE_CODES = {
        'srf': 'de',
        'rts': 'fr',
        'rsi': 'it',
        'rtr': 'rm',
        'swi': 'en',
    }

    def _get_tokenized_src(self, url, video_id, format_id):
        # Akamai-protected sources need auth parameters appended to the
        # URL; fetching the token is best-effort (fatal=False).
        token = self._download_json(
            'http://tp.srgssr.ch/akahd/token?acl=*',
            video_id, 'Downloading %s token' % format_id, fatal=False) or {}
        auth_params = try_get(token, lambda x: x['token']['authparams'])
        if auth_params:
            url += ('?' if '?' not in url else '&') + auth_params
        return url

    def _get_media_data(self, bu, media_type, media_id):
        """Fetch the chapter entry matching media_id from the
        integration-layer mediaComposition endpoint, raising a descriptive
        error when the media is blocked."""
        query = {'onlyChapters': True} if media_type == 'video' else {}
        full_media_data = self._download_json(
            'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json'
            % (bu, media_type, media_id),
            media_id, query=query)['chapterList']
        try:
            media_data = next(
                x for x in full_media_data if x.get('id') == media_id)
        except StopIteration:
            raise ExtractorError('No media information found')

        block_reason = media_data.get('blockReason')
        if block_reason and block_reason in self._ERRORS:
            message = self._ERRORS[block_reason]
            if block_reason == 'GEOBLOCK':
                self.raise_geo_restricted(
                    msg=message, countries=self._GEO_COUNTRIES)
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, message), expected=True)

        return media_data

    def _real_extract(self, url):
        bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
        media_data = self._get_media_data(bu, media_type, media_id)
        title = media_data['title']

        formats = []
        q = qualities(['SD', 'HD'])
        for source in (media_data.get('resourceList') or []):
            format_url = source.get('url')
            if not format_url:
                continue
            protocol = source.get('protocol')
            quality = source.get('quality')
            # format_id is "<protocol>-<encoding>-<quality>", skipping
            # any missing component.
            format_id = []
            for e in (protocol, source.get('encoding'), quality):
                if e:
                    format_id.append(e)
            format_id = '-'.join(format_id)

            if protocol in ('HDS', 'HLS'):
                if source.get('tokenType') == 'AKAMAI':
                    format_url = self._get_tokenized_src(
                        format_url, media_id, format_id)
                    formats.extend(self._extract_akamai_formats(
                        format_url, media_id))
                elif protocol == 'HLS':
                    # Untokenized HLS sources go straight to the m3u8
                    # parser.
                    formats.extend(self._extract_m3u8_formats(
                        format_url, media_id, 'mp4', 'm3u8_native',
                        m3u8_id=format_id, fatal=False))
            elif protocol in ('HTTP', 'HTTPS'):
                formats.append({
                    'format_id': format_id,
                    'url': format_url,
                    'quality': q(quality),
                })

        # This is needed because for audio medias the podcast url is usually
        # always included, even if is only an audio segment and not the
        # whole episode.
        if int_or_none(media_data.get('position')) == 0:
            for p in ('S', 'H'):
                # podcastSdUrl / podcastHdUrl
                podcast_url = media_data.get('podcast%sdUrl' % p)
                if not podcast_url:
                    continue
                quality = p + 'D'
                formats.append({
                    'format_id': 'PODCAST-' + quality,
                    'url': podcast_url,
                    'quality': q(quality),
                })
        self._sort_formats(formats)

        subtitles = {}
        if media_type == 'video':
            for sub in (media_data.get('subtitleList') or []):
                sub_url = sub.get('url')
                if not sub_url:
                    continue
                lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu]
                subtitles.setdefault(lang, []).append({
                    'url': sub_url,
                })

        return {
            'id': media_id,
            'title': title,
            'description': media_data.get('description'),
            'timestamp': parse_iso8601(media_data.get('date')),
            'thumbnail': media_data.get('imageUrl'),
            'duration': float_or_none(media_data.get('duration'), 1000),
            'subtitles': subtitles,
            'formats': formats,
        }
Sedrun Mustér Turissem', 'timestamp': 1444709160, 'duration': 336.816, }, 'params': { # rtmp download 'skip_download': True, }, }, { 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', 'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', 'info_dict': { 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', 'duration': 1796.76, 'title': 'Le 19h30', 'upload_date': '20141201', 'timestamp': 1417458600, 'thumbnail': r're:^https?://.*\.image', }, 'params': { # m3u8 download 'skip_download': True, } }, { 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', 'info_dict': { 'id': '42960270', 'ext': 'mp4', 'title': 'Why people were against tax reforms', 'description': 'md5:7ac442c558e9630e947427469c4b824d', 'duration': 94.0, 'upload_date': '20170215', 'timestamp': 1487173560, 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', 'subtitles': 'count:9', }, 'params': { 'skip_download': True, } }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', 'only_matching': True, }, { 'url': 'https://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?urn=urn:srf:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'only_matching': True, }, { 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', 'only_matching': True, }, { # audio segment, has podcastSdUrl of the full episode 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) bu = mobj.group('bu') media_type = mobj.group('type') or mobj.group('type_2') media_id = mobj.group('id') return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') ================================================ FILE: youtube_dl/extractor/srmediathek.py ================================================ # coding: utf-8 from __future__ import 
unicode_literals from .ard import ARDMediathekBaseIE from ..utils import ( ExtractorError, get_element_by_attribute, ) class SRMediathekIE(ARDMediathekBaseIE): IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', 'info_dict': { 'id': '28455', 'ext': 'mp4', 'title': 'sportarena (26.10.2014)', 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'no longer available', }, { 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682', 'info_dict': { 'id': '37682', 'ext': 'mp4', 'title': 'Love, Cakes and Rock\'n\'Roll', 'description': 'md5:18bf9763631c7d326c22603681e1123d', }, 'params': { # m3u8 download 'skip_download': True, }, }, { 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) if '>Der gewünschte Beitrag ist leider nicht mehr verfügbar.<' in webpage: raise ExtractorError('Video %s is no longer available' % video_id, expected=True) media_collection_url = self._search_regex( r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url') info = self._extract_media_info(media_collection_url, webpage, video_id) info.update({ 'id': video_id, 'title': get_element_by_attribute('class', 'ardplayer-title', webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), }) return info ================================================ FILE: youtube_dl/extractor/stanfordoc.py 
class StanfordOpenClassroomIE(InfoExtractor):
    """Stanford Open ClassRoom extractor.

    Handles three URL shapes:
      * a specific video page -> a single video info dict
      * a course page         -> a playlist of that course's video pages
      * the site root         -> a playlist of all course pages
    """
    IE_NAME = 'stanfordoc'
    IE_DESC = 'Stanford Open ClassRoom'
    _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    _TEST = {
        'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
        'md5': '544a9468546059d4e80d76265b0443b8',
        'info_dict': {
            'id': 'PracticalUnix_intro-environment',
            'ext': 'mp4',
            'title': 'Intro Environment',
        }
    }

    def _extract_link_entries(self, page, link_regex):
        # Turn every (deduplicated, in order) matching link on the page
        # into a url_result entry pointing back into this extractor.
        return [
            self.url_result(
                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(link))
            for link in orderedSet(re.findall(link_regex, page))]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        if mobj.group('course') and mobj.group('video'):  # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }
            base_url = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xml_url = base_url + video + '.xml'
            mdoc = self._download_xml(xml_url, info['id'])
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = base_url + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError('Invalid metadata XML file')
            return info
        elif mobj.group('course'):  # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                '_type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }
            coursepage = self._download_webpage(
                url, info['id'],
                note='Downloading course info page',
                errnote='Unable to download course info page')
            info['title'] = self._html_search_regex(
                r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
            info['description'] = self._html_search_regex(
                r'(?s)<description>([^<]+)</description>',
                coursepage, 'description', fatal=False)
            info['entries'] = self._extract_link_entries(
                coursepage, r'<a href="(VideoPage\.php\?[^"]+)">')
            return info
        else:  # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                '_type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }
            info['title'] = info['id']
            root_url = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = self._download_webpage(
                root_url, info['id'],
                errnote='Unable to download course info page')
            info['entries'] = self._extract_link_entries(
                rootpage, r'<a href="(CoursePage\.php\?[^"]+)">')
            return info
# For urltype == video we sometimes get the videoID | https?://(?:www\.)?steamcommunity\.com/sharedfiles/filedetails/\?id=(?P<fileID>[0-9]+) """ _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' _TESTS = [{ 'url': 'http://store.steampowered.com/video/105600/', 'playlist': [ { 'md5': '6a294ee0c4b1f47f5bb76a65e31e3592', 'info_dict': { 'id': '2040428', 'ext': 'mp4', 'title': 'Terraria 1.3 Trailer', 'playlist_index': 1, } }, { 'md5': '911672b20064ca3263fa89650ba5a7aa', 'info_dict': { 'id': '2029566', 'ext': 'mp4', 'title': 'Terraria 1.2 Trailer', 'playlist_index': 2, } } ], 'info_dict': { 'id': '105600', 'title': 'Terraria', }, 'params': { 'playlistend': 2, } }, { 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', 'info_dict': { 'id': 'X8kpJBlzD2E', 'ext': 'mp4', 'upload_date': '20140617', 'title': 'FRONTIERS - Trapping', 'description': 'md5:bf6f7f773def614054089e5769c12a6e', 'uploader': 'AAD Productions', 'uploader_id': 'AtomicAgeDogGames', } }] def _real_extract(self, url): m = re.match(self._VALID_URL, url) fileID = m.group('fileID') if fileID: videourl = url playlist_id = fileID else: gameID = m.group('gameID') playlist_id = gameID videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id self._set_cookie('steampowered.com', 'mature_content', '1') webpage = self._download_webpage(videourl, playlist_id) if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None: videourl = self._AGECHECK_TEMPLATE % playlist_id self.report_age_confirmation() webpage = self._download_webpage(videourl, playlist_id) flash_vars = self._parse_json(self._search_regex( r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage, 'flash vars'), playlist_id, js_to_json) playlist_title = None entries = [] if fileID: playlist_title = get_element_by_class('workshopItemTitle', webpage) for movie in 
flash_vars.values(): if not movie: continue youtube_id = movie.get('YOUTUBE_VIDEO_ID') if not youtube_id: continue entries.append({ '_type': 'url', 'url': youtube_id, 'ie_key': 'Youtube', }) else: playlist_title = get_element_by_class('apphub_AppName', webpage) for movie_id, movie in flash_vars.items(): if not movie: continue video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False) title = movie.get('MOVIE_NAME') if not title or not video_id: continue entry = { 'id': video_id, 'title': title.replace('+', ' '), } formats = [] flv_url = movie.get('FILENAME') if flv_url: formats.append({ 'format_id': 'flv', 'url': flv_url, }) highlight_element = self._search_regex( r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id, webpage, 'highlight element', fatal=False) if highlight_element: highlight_attribs = extract_attributes(highlight_element) if highlight_attribs: entry['thumbnail'] = highlight_attribs.get('data-poster') for quality in ('', '-hd'): for ext in ('webm', 'mp4'): video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality)) if video_url: formats.append({ 'format_id': ext + quality, 'url': video_url, }) if not formats: continue entry['formats'] = formats entries.append(entry) if not entries: raise ExtractorError('Could not find any videos') return self.playlist_result(entries, playlist_id, playlist_title) ================================================ FILE: youtube_dl/extractor/stitcher.py ================================================ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( clean_html, clean_podcast_url, ExtractorError, int_or_none, str_or_none, try_get, url_or_none, ) class StitcherBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/' def _call_api(self, path, video_id, query): resp = self._download_json( 'https://api.prod.stitcher.com/' + path, video_id, query=query) error_massage = 
try_get(resp, lambda x: x['errors'][0]['message']) if error_massage: raise ExtractorError(error_massage, expected=True) return resp['data'] def _extract_description(self, data): return clean_html(data.get('html_description') or data.get('description')) def _extract_audio_url(self, episode): return url_or_none(episode.get('audio_url') or episode.get('guid')) def _extract_show_info(self, show): return { 'thumbnail': show.get('image_base_url'), 'series': show.get('title'), } def _extract_episode(self, episode, audio_url, show_info): info = { 'id': compat_str(episode['id']), 'display_id': episode.get('slug'), 'title': episode['title'].strip(), 'description': self._extract_description(episode), 'duration': int_or_none(episode.get('duration')), 'url': clean_podcast_url(audio_url), 'vcodec': 'none', 'timestamp': int_or_none(episode.get('date_published')), 'season_number': int_or_none(episode.get('season')), 'season_id': str_or_none(episode.get('season_id')), } info.update(show_info) return info class StitcherIE(StitcherBaseIE): _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', 'md5': 'e9635098e0da10b21a0e2b85585530f6', 'info_dict': { 'id': '40789481', 'ext': 'mp3', 'title': 'Machine Learning Mastery and Cancer Clusters', 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20151008', 'timestamp': 1444285800, 'series': 'Talking Machines', }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', 'info_dict': { 'id': '40846275', 'display_id': 'the-rare-hourlong-comedy-plus', 'ext': 'mp3', 'title': "The CW's 'Crazy Ex-Girlfriend'", 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17', 'duration': 2235, 'thumbnail': r're:^https?://.*\.jpg', }, 'params': { 'skip_download': True, }, 'skip': 
'Page Not Found', }, { # escaped title 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', 'only_matching': True, }, { 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', 'only_matching': True, }, { 'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584', 'only_matching': True, }] def _real_extract(self, url): audio_id = self._match_id(url) data = self._call_api( 'shows/episodes', audio_id, {'episode_ids': audio_id}) episode = data['episodes'][0] audio_url = self._extract_audio_url(episode) if not audio_url: self.raise_login_required() show = try_get(data, lambda x: x['shows'][0], dict) or {} return self._extract_episode( episode, audio_url, self._extract_show_info(show)) class StitcherShowIE(StitcherBaseIE): _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines', 'info_dict': { 'id': 'the-talking-machines', 'title': 'Talking Machines', 'description': 'md5:831f0995e40f26c10231af39cf1ebf0b', }, 'playlist_mincount': 106, }, { 'url': 'https://www.stitcher.com/show/the-talking-machines', 'only_matching': True, }] def _real_extract(self, url): show_slug = self._match_id(url) data = self._call_api( 'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000}) show = try_get(data, lambda x: x['shows'][0], dict) or {} show_info = self._extract_show_info(show) entries = [] for episode in (data.get('episodes') or []): audio_url = self._extract_audio_url(episode) if not audio_url: continue entries.append(self._extract_episode(episode, audio_url, show_info)) return self.playlist_result( entries, show_slug, show.get('title'), self._extract_description(show)) ================================================ FILE: youtube_dl/extractor/storyfire.py ================================================ # coding: utf-8 from 
__future__ import unicode_literals import functools from .common import InfoExtractor from ..utils import ( # HEADRequest, int_or_none, OnDemandPagedList, smuggle_url, ) class StoryFireBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/' def _call_api(self, path, video_id, resource, query=None): return self._download_json( 'https://storyfire.com/app/%s/%s' % (path, video_id), video_id, 'Downloading %s JSON metadata' % resource, query=query) def _parse_video(self, video): title = video['title'] vimeo_id = self._search_regex( r'https?://player\.vimeo\.com/external/(\d+)', video['vimeoVideoURL'], 'vimeo id') # video_url = self._request_webpage( # HEADRequest(video['vimeoVideoURL']), video_id).geturl() # formats = [] # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]: # formats.extend(self._extract_m3u8_formats( # v_url, video_id, 'mp4', 'm3u8_native', # m3u8_id='hls' + suffix, fatal=False)) # formats.extend(self._extract_mpd_formats( # v_url.replace('.m3u8', '.mpd'), video_id, # mpd_id='dash' + suffix, fatal=False)) # self._sort_formats(formats) uploader_id = video.get('hostID') return { '_type': 'url_transparent', 'id': vimeo_id, 'title': title, 'description': video.get('description'), 'url': smuggle_url( 'https://player.vimeo.com/video/' + vimeo_id, { 'http_headers': { 'Referer': 'https://storyfire.com/', } }), # 'formats': formats, 'thumbnail': video.get('storyImage'), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likesCount')), 'comment_count': int_or_none(video.get('commentsCount')), 'duration': int_or_none(video.get('videoDuration')), 'timestamp': int_or_none(video.get('publishDate')), 'uploader': video.get('username'), 'uploader_id': uploader_id, 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None, 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), } class 
StoryFireIE(StoryFireBaseIE): _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P<id>[0-9a-f]{24})' _TEST = { 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', 'md5': 'caec54b9e4621186d6079c7ec100c1eb', 'info_dict': { 'id': '378954662', 'ext': 'mp4', 'title': 'Buzzfeed Teaches You About Memes', 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', 'timestamp': 1576129028, 'description': 'md5:0b4e28021548e144bed69bb7539e62ea', 'uploader': 'whang!', 'upload_date': '20191212', 'duration': 418, 'view_count': int, 'like_count': int, 'comment_count': int, }, 'params': { 'skip_download': True, }, 'expected_warnings': ['Unable to download JSON metadata'] } def _real_extract(self, url): video_id = self._match_id(url) video = self._call_api( 'generic/video-detail', video_id, 'video')['video'] return self._parse_video(video) class StoryFireUserIE(StoryFireBaseIE): _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P<id>[^/]+)/video' _TEST = { 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', 'info_dict': { 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', }, 'playlist_mincount': 151, } _PAGE_SIZE = 20 def _fetch_page(self, user_id, page): videos = self._call_api( 'publicVideos', user_id, 'page %d' % (page + 1), { 'skip': page * self._PAGE_SIZE, })['videos'] for video in videos: yield self._parse_video(video) def _real_extract(self, url): user_id = self._match_id(url) entries = OnDemandPagedList(functools.partial( self._fetch_page, user_id), self._PAGE_SIZE) return self.playlist_result(entries, user_id) class StoryFireSeriesIE(StoryFireBaseIE): _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', 'info_dict': { 'id': '-Lq6MsuIHLODO6d2dDkr', }, 'playlist_mincount': 13, }, { 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', 'info_dict': { 'id': 'the_mortal_one', }, 'playlist_count': 0, }] def 
_extract_videos(self, stories): for story in stories.values(): if story.get('hasVideo'): yield self._parse_video(story) def _real_extract(self, url): series_id = self._match_id(url) stories = self._call_api( 'seriesStories', series_id, 'series stories') return self.playlist_result(self._extract_videos(stories), series_id) ================================================ FILE: youtube_dl/extractor/streamable.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, int_or_none, ) class StreamableIE(InfoExtractor): _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' _TESTS = [ { 'url': 'https://streamable.com/dnd1', 'md5': '3e3bc5ca088b48c2d436529b64397fef', 'info_dict': { 'id': 'dnd1', 'ext': 'mp4', 'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol', 'thumbnail': r're:https?://.*\.jpg$', 'uploader': 'teabaker', 'timestamp': 1454964157.35115, 'upload_date': '20160208', 'duration': 61.516, 'view_count': int, } }, # older video without bitrate, width/height, etc. 
info { 'url': 'https://streamable.com/moo', 'md5': '2cf6923639b87fba3279ad0df3a64e73', 'info_dict': { 'id': 'moo', 'ext': 'mp4', 'title': '"Please don\'t eat me!"', 'thumbnail': r're:https?://.*\.jpg$', 'timestamp': 1426115495, 'upload_date': '20150311', 'duration': 12, 'view_count': int, } }, { 'url': 'https://streamable.com/e/dnd1', 'only_matching': True, }, { 'url': 'https://streamable.com/s/okkqk/drxjds', 'only_matching': True, } ] @staticmethod def _extract_url(webpage): mobj = re.search( r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)', webpage) if mobj: return mobj.group('src') def _real_extract(self, url): video_id = self._match_id(url) # Note: Using the ajax API, as the public Streamable API doesn't seem # to return video info like the title properly sometimes, and doesn't # include info like the video duration video = self._download_json( 'https://ajax.streamable.com/videos/%s' % video_id, video_id) # Format IDs: # 0 The video is being uploaded # 1 The video is being processed # 2 The video has at least one file ready # 3 The video is unavailable due to an error status = video.get('status') if status != 2: raise ExtractorError( 'This video is currently unavailable. 
It may still be uploading or processing.', expected=True) title = video.get('reddit_title') or video['title'] formats = [] for key, info in video['files'].items(): if not info.get('url'): continue formats.append({ 'format_id': key, 'url': self._proto_relative_url(info['url']), 'width': int_or_none(info.get('width')), 'height': int_or_none(info.get('height')), 'filesize': int_or_none(info.get('size')), 'fps': int_or_none(info.get('framerate')), 'vbr': float_or_none(info.get('bitrate'), 1000) }) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': video.get('description'), 'thumbnail': self._proto_relative_url(video.get('thumbnail_url')), 'uploader': video.get('owner', {}).get('user_name'), 'timestamp': float_or_none(video.get('date_added')), 'duration': float_or_none(video.get('duration')), 'view_count': int_or_none(video.get('plays')), 'formats': formats } ================================================ FILE: youtube_dl/extractor/streamcloud.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( ExtractorError, urlencode_postdata, ) class StreamcloudIE(InfoExtractor): IE_NAME = 'streamcloud.eu' _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?' 
_TESTS = [{ 'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html', 'md5': '6bea4c7fa5daaacc2a946b7146286686', 'info_dict': { 'id': 'skp9j99s4bpz', 'ext': 'mp4', 'title': 'youtube-dl test video \'/\\ ä ↭', }, 'skip': 'Only available from the EU' }, { 'url': 'http://streamcloud.eu/ua8cmfh1nbe6/NSHIP-148--KUC-NG--H264-.mp4.html', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) url = 'http://streamcloud.eu/%s' % video_id orig_webpage = self._download_webpage(url, video_id) if '>File Not Found<' in orig_webpage: raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) fields = re.findall(r'''(?x)<input\s+ type="(?:hidden|submit)"\s+ name="([^"]+)"\s+ (?:id="[^"]+"\s+)? value="([^"]*)" ''', orig_webpage) self._sleep(6, video_id) webpage = self._download_webpage( url, video_id, data=urlencode_postdata(fields), headers={ b'Content-Type': b'application/x-www-form-urlencoded', }) try: title = self._html_search_regex( r'<h1[^>]*>([^<]+)<', webpage, 'title') video_url = self._search_regex( r'file:\s*"([^"]+)"', webpage, 'video URL') except ExtractorError: message = self._html_search_regex( r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>', webpage, 'message', default=None, group='message') if message: raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) raise thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, 'http_headers': { 'Referer': url, }, } ================================================ FILE: youtube_dl/extractor/streamcz.py ================================================ # coding: utf-8 from __future__ import unicode_literals import json import re from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, merge_dicts, parse_codecs, urljoin, ) class 
StreamCZIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:stream|televizeseznam)\.cz/[^?#]+/(?P<display_id>[^?#]+)-(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.televizeseznam.cz/video/lajna/buh-57953890', 'md5': '40c41ade1464a390a0b447e333df4239', 'info_dict': { 'id': '57953890', 'ext': 'mp4', 'title': 'Bůh', 'display_id': 'buh', 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165', 'duration': 1369.6, 'view_count': int, } }, { 'url': 'https://www.stream.cz/kdo-to-mluvi/kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna-64087937', 'md5': '41fd358000086a1ccdb068c77809b158', 'info_dict': { 'id': '64087937', 'ext': 'mp4', 'title': 'Kdo to mluví? Velké odhalení přináší nový pořad už od 25. srpna', 'display_id': 'kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna', 'description': 'md5:97a811000a6460266029d6c1c2ebcd59', 'duration': 50.2, 'view_count': int, } }, { 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267', 'md5': '3ee4d0be040e8f4a543e67e509d55e3f', 'info_dict': { 'id': '64147267', 'ext': 'mp4', 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. 
Badatelé vše objasnili', 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili', 'description': 'md5:4b8ada6718d34bb011c4e04ca4bc19bf', 'duration': 442.84, 'view_count': int, } }] def _extract_formats(self, spl_url, video): for ext, pref, streams in ( ('ts', -1, video.get('http_stream', {}).get('qualities', {})), ('mp4', 1, video.get('mp4'))): for format_id, stream in streams.items(): if not stream.get('url'): continue yield merge_dicts({ 'format_id': '-'.join((format_id, ext)), 'ext': ext, 'source_preference': pref, 'url': urljoin(spl_url, stream['url']), 'tbr': float_or_none(stream.get('bandwidth'), scale=1000), 'duration': float_or_none(stream.get('duration'), scale=1000), 'width': stream.get('resolution', 2 * [0])[0] or None, 'height': stream.get('resolution', 2 * [0])[1] or int_or_none(format_id.replace('p', '')), }, parse_codecs(stream.get('codec'))) def _real_extract(self, url): display_id, video_id = re.match(self._VALID_URL, url).groups() data = self._download_json( 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result', data=json.dumps({ 'variables': {'urlName': video_id}, 'query': ''' query LoadEpisode($urlName : String){ episode(urlName: $urlName){ ...VideoDetailFragmentOnEpisode } } fragment VideoDetailFragmentOnEpisode on Episode { id spl urlName name perex duration views }''' }).encode('utf-8'), headers={'Content-Type': 'application/json;charset=UTF-8'} )['data']['episode'] spl_url = data['spl'] + 'spl2,3' metadata = self._download_json(spl_url, video_id, 'Downloading playlist') if 'Location' in metadata and 'data' not in metadata: spl_url = metadata['Location'] metadata = self._download_json(spl_url, video_id, 'Downloading redirected playlist') video = metadata['data'] subtitles = {} for subs in video.get('subtitles', {}).values(): if not subs.get('language'): continue for ext, sub_url in subs.get('urls').items(): subtitles.setdefault(subs['language'], []).append({ 'ext': ext, 'url': 
urljoin(spl_url, sub_url) }) formats = list(self._extract_formats(spl_url, video)) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': data.get('name'), 'description': data.get('perex'), 'duration': float_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), 'formats': formats, 'subtitles': subtitles, } ================================================ FILE: youtube_dl/extractor/streamsb.py ================================================ # coding: utf-8 from __future__ import unicode_literals import binascii import random import re import string from .common import InfoExtractor from ..utils import urljoin, url_basename def to_ascii_hex(str1): return binascii.hexlify(str1.encode('utf-8')).decode('ascii') def generate_random_string(length): return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(length)) class StreamsbIE(InfoExtractor): _DOMAINS = ('viewsb.com', ) _VALID_URL = r'https://(?P<domain>%s)/(?P<id>.+)' % '|'.join(_DOMAINS) _TEST = { 'url': 'https://viewsb.com/dxfvlu4qanjx', 'md5': '488d111a63415369bf90ea83adc8a325', 'info_dict': { 'id': 'dxfvlu4qanjx', 'ext': 'mp4', 'title': 'Sintel' } } def _real_extract(self, url): domain, video_id = re.match(self._VALID_URL, url).group('domain', 'id') webpage = self._download_webpage(url, video_id) iframe_rel_url = self._search_regex(r'''(?i)<iframe\b[^>]+\bsrc\s*=\s*('|")(?P<path>/.*\.html)\1''', webpage, 'iframe', group='path') iframe_url = urljoin('https://' + domain, iframe_rel_url) iframe_data = self._download_webpage(iframe_url, video_id) app_version = self._search_regex(r'''<script\b[^>]+\bsrc\s*=\s*["|'].*/app\.min\.(\d+)\.js''', iframe_data, 'app version', fatal=False) or '50' video_code = url_basename(iframe_url).rsplit('.')[0] length = 12 req = '||'.join((generate_random_string(length), video_code, generate_random_string(length), 'streamsb')) ereq = 'https://{0}/sources{1}/{2}'.format(domain, app_version, to_ascii_hex(req)) 
video_data = self._download_webpage(ereq, video_id, headers={ 'Referer': iframe_url, 'watchsb': 'sbstream', }) player_data = self._parse_json(video_data, video_id) title = player_data['stream_data']['title'] formats = self._extract_m3u8_formats(player_data['stream_data']['file'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) return { 'id': video_id, 'formats': formats, 'title': title, } ================================================ FILE: youtube_dl/extractor/streetvoice.py ================================================ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, str_or_none, strip_or_none, try_get, urljoin, ) class StreetVoiceIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://streetvoice.com/skippylu/songs/123688/', 'md5': '0eb535970629a5195685355f3ed60bfd', 'info_dict': { 'id': '123688', 'ext': 'mp3', 'title': '流浪', 'description': 'md5:8eb0bfcc9dcd8aa82bd6efca66e3fea6', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 270, 'upload_date': '20100923', 'uploader': 'Crispy脆樂團', 'uploader_id': '627810', 'uploader_url': 're:^https?://streetvoice.com/skippylu/', 'timestamp': 1285261661, 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, 'track': '流浪', 'track_id': '123688', 'album': '2010', } }, { 'url': 'http://tw.streetvoice.com/skippylu/songs/94440/', 'only_matching': True, }] def _real_extract(self, url): song_id = self._match_id(url) base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id song = self._download_json(base_url, song_id, query={ 'fields': 'album,comments_count,created_at,id,image,length,likes_count,name,nickname,plays_count,profile,share_count,synopsis,user,username', }) title = song['name'] formats = [] for suffix, format_id in [('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')]: f_url = 
(self._download_json( base_url + suffix + '/', song_id, 'Downloading %s format URL' % format_id, data=b'', fatal=False) or {}).get('file') if not f_url: continue f = { 'ext': 'mp3', 'format_id': format_id, 'url': f_url, 'vcodec': 'none', } if format_id == 'hls': f['protocol'] = 'm3u8_native' abr = self._search_regex(r'\.mp3\.(\d+)k', f_url, 'bitrate', default=None) if abr: abr = int(abr) f.update({ 'abr': abr, 'tbr': abr, }) formats.append(f) user = song.get('user') or {} username = user.get('username') get_count = lambda x: int_or_none(song.get(x + '_count')) return { 'id': song_id, 'formats': formats, 'title': title, 'description': strip_or_none(song.get('synopsis')), 'thumbnail': song.get('image'), 'duration': int_or_none(song.get('length')), 'timestamp': parse_iso8601(song.get('created_at')), 'uploader': try_get(user, lambda x: x['profile']['nickname']), 'uploader_id': str_or_none(user.get('id')), 'uploader_url': urljoin(url, '/%s/' % username) if username else None, 'view_count': get_count('plays'), 'like_count': get_count('likes'), 'comment_count': get_count('comments'), 'repost_count': get_count('share'), 'track': title, 'track_id': song_id, 'album': try_get(song, lambda x: x['album']['name']), } ================================================ FILE: youtube_dl/extractor/stretchinternet.py ================================================ from __future__ import unicode_literals from .common import InfoExtractor class StretchInternetIE(InfoExtractor): _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/(?:portal|full)\.htm\?.*?\beventId=(?P<id>\d+)' _TEST = { 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=573272&streamType=video', 'info_dict': { 'id': '573272', 'ext': 'mp4', 'title': 'UNIVERSITY OF MARY WRESTLING VS UPPER IOWA', # 'timestamp': 1575668361, # 'upload_date': '20191206', 'uploader_id': '99997', } } def _real_extract(self, url): video_id = self._match_id(url) media_url = self._download_json( 
'https://core.stretchlive.com/trinity/event/tcg/' + video_id, video_id)[0]['media'][0]['url'] event = self._download_json( 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', video_id, query={'eventID': video_id, 'token': 'asdf'})['event'] return { 'id': video_id, 'title': event['title'], # TODO: parse US timezone abbreviations # 'timestamp': event.get('dateTimeString'), 'url': 'https://' + media_url, 'uploader_id': event.get('ownerID'), } ================================================ FILE: youtube_dl/extractor/stv.py ================================================ # coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( compat_str, float_or_none, int_or_none, smuggle_url, str_or_none, try_get, ) class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})' _TESTS = [{ # shortform 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { 'id': '5333973339001', 'ext': 'mp4', 'upload_date': '20170301', 'title': '60 seconds on set with Laura Norton', 'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? 
Let\'s find out!", 'timestamp': 1488388054, 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', }, { # episodes 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', 'video': 'shortform', } def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id, fatal=False) or '' props = (self._parse_json(self._search_regex( r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', webpage, 'next data', default='{}'), video_id, fatal=False) or {}).get('props') or {} player_api_cache = try_get( props, lambda x: x['initialReduxState']['playerApiCache']) or {} api_path, resp = None, {} for k, v in player_api_cache.items(): if k.startswith('/episodes/') or k.startswith('/shortform/'): api_path, resp = k, v break else: episode_id = str_or_none(try_get( props, lambda x: x['pageProps']['episodeId'])) api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) result = resp.get('results') if not result: resp = self._download_json( 'https://player.api.stv.tv/v1' + api_path, video_id) result = resp['results'] video = result['video'] video_id = compat_str(video['id']) subtitles = {} _subtitles = result.get('_subtitles') or {} for ext, sub_url in _subtitles.items(): subtitles.setdefault('en', []).append({ 'ext': 'vtt' if ext == 'webvtt' else ext, 'url': sub_url, }) programme = result.get('programme') or {} return { '_type': 'url_transparent', 'id': video_id, 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), 'description': result.get('summary'), 'duration': float_or_none(video.get('length'), 1000), 'subtitles': subtitles, 'view_count': int_or_none(result.get('views')), 'series': programme.get('name') or programme.get('shortName'), 'ie_key': 
'BrightcoveNew', } ================================================ FILE: youtube_dl/extractor/sunporno.py ================================================ from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( parse_duration, int_or_none, qualities, determine_ext, ) class SunPornoIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.sunporno.com/videos/807778/', 'md5': '507887e29033502f29dba69affeebfc9', 'info_dict': { 'id': '807778', 'ext': 'mp4', 'title': 'md5:0a400058e8105d39e35c35e7c5184164', 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 302, 'age_limit': 18, } }, { 'url': 'http://embeds.sunporno.com/embed/807778', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://www.sunporno.com/videos/%s' % video_id, video_id) title = self._html_search_regex( r'<title>([^<]+)', webpage, 'title') description = self._html_search_meta( 'description', webpage, 'description') thumbnail = self._html_search_regex( r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<', r'>Duration:\s*]+>\s*(\d+:\d+)\s*<'), webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( r'class="views">(?: