download all collections in category

Alexander Matson 2019-01-14 13:31:14 -05:00
parent c2a9ae0ca2
commit 4bebd4b7b4
2 changed files with 30 additions and 6 deletions

View File

@@ -1,3 +1,6 @@
 from pir_connector import *
-getCollection("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music/happymusic")
+# getCollection("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music/happymusic")
+# getCollection("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music/Corporate-Grooves-Vol-2")
+getCategory("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music")
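
For orientation, the category crawl this enables, as a minimal standalone sketch. The names and the href-regex trick are taken from the connector diff below; the exact markup of the category page is an assumption, so treat it as illustrative:

import re
import requests
from bs4 import BeautifulSoup

category_url = "https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music"
category_name = category_url.split("/")[-1]  # "Corporate-Music-and-Motivational-Music"

soup = BeautifulSoup(requests.get(category_url).text, 'html.parser')

# Any tag whose href contains "<category-slug>/" is assumed to point at a
# collection page under this category; the set deduplicates repeated links.
links = {tag['href'] for tag in soup.find_all(href=re.compile(category_name + "/"))}
for link in sorted(links):
    print(link)  # getCategory feeds each of these to getCollection(link, category_name)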

View File

@@ -1,6 +1,7 @@
 import requests
 import json
 import os
+import re
 import time
 from bs4 import BeautifulSoup
 from selenium import webdriver
@@ -17,7 +18,22 @@ base_url = "https://www.partnersinrhyme.com/royaltyfreemusic"
 base_files_url = "https://www.partnersinrhyme.com/files/"
 base_out_url = "files/PartnersInRhyme/"
 
-def getCollection(collection_url):
+def getCategory(category_url):
+    r = requests.get(category_url)
+    soup = BeautifulSoup(r.text, 'html.parser')
+    category_name = category_url.split("/")[-1]
+    links = set()
+    for li in soup.find_all(href=re.compile(category_name + "/")):
+        links.add(li['href'])
+    for lk in links:
+        print(lk)
+        getCollection(lk, category_name)
+
+def getCollection(collection_url, category_name):
     r = requests.get(collection_url)
     soup = BeautifulSoup(r.text, 'html.parser')
     player = soup.iframe["src"]
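
One caveat in the getCategory hunk above: hrefs are added to the set verbatim, so relative links would break the later requests.get(collection_url). If the page emits relative hrefs, urljoin normalizes them; a sketch of the adjusted loop, reusing the names from the diff:

from urllib.parse import urljoin

for li in soup.find_all(href=re.compile(category_name + "/")):
    links.add(urljoin(category_url, li['href']))  # absolute URLs either way
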
@@ -31,15 +47,19 @@ def getCollection(collection_url):
     time.sleep(2)
     psoup = BeautifulSoup(driver.page_source, 'html.parser')
+    # print(psoup)
     driver.quit()
     for li in psoup.find_all("li"):
-        print("downloading...", li.attrs['data-mp3'])
-        downloadSong(base_files_url + li.attrs['data-mp3'])
+        try:
+            print("downloading...", li.attrs['data-mp3'])
+            downloadSong(base_files_url + li.attrs['data-mp3'], category_name)
+        except KeyError:
+            print("Could not load", collection_url.split("/")[-1], "possibly has hierarchy")
 
-def downloadSong(song_url):
+def downloadSong(song_url, category_name):
     (collection_name, outFile) = getSongFromURL(song_url)
-    outDir = base_out_url + collection_name
+    outDir = base_out_url + category_name + "/" + collection_name
     if not os.path.exists(outDir):
         os.makedirs(outDir)
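
The new try/except guards against <li> elements that carry no data-mp3 attribute, which is what a collection page consisting of sub-collections produces. An equivalent sketch using attrs.get instead of catching KeyError, with the same names as in the diff:

for li in psoup.find_all("li"):
    mp3 = li.attrs.get('data-mp3')  # None for non-track <li> elements
    if mp3 is None:
        print("Could not load", collection_url.split("/")[-1], "possibly has hierarchy")
        continue
    print("downloading...", mp3)
    downloadSong(base_files_url + mp3, category_name)
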
@@ -51,6 +71,7 @@ def downloadSong(song_url):
             tempFile.write(i.content)
     else:
         print("File", outFile, "already exists... skipping")
+    print("")
 
 def getSongFromURL(song_url):
     list = song_url.split("/")
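
Net effect of the downloadSong change: output is now grouped by category first, then collection. An illustrative sketch of the resulting path, using the collection from the driver; os.path.join is shown only as the portable spelling of the manual "/" concatenation in the diff:

import os

base_out_url = "files/PartnersInRhyme/"
category_name = "Corporate-Music-and-Motivational-Music"
collection_name = "happymusic"  # from the commented-out driver URL

outDir = os.path.join(base_out_url, category_name, collection_name)
print(outDir)  # files/PartnersInRhyme/Corporate-Music-and-Motivational-Music/happymusic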