download all collections in category

delete stuff
don't download if file is already there
2019-01-14 13:31:14 -05:00 · 2019-01-14 12:48:03 -05:00 · 2019-01-14 12:47:03 -05:00 · 2019-01-14 12:34:19 -05:00 · 2019-01-14 11:28:58 -05:00 · 2019-01-14 11:28:29 -05:00
6 changed files with 207 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,5 @@
 env/
 files/
 __pycache__/
 *.tar.gz
 *.log
--- a/BIN
+++ b/BIN
--- a/main.py
+++ b/main.py
@ -0,0 +1,6 @@
 # Script entry point: pulls every public name from pir_connector and runs
 # getCategory on one category page URL.
 from pir_connector import *
 # NOTE(review): the two disabled calls below pass a single argument; confirm
 # they still match getCollection's signature before re-enabling them.
 # getCollection("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music/happymusic")
 # getCollection("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music/Corporate-Grooves-Vol-2")
 getCategory("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music")
--- a/main_example.py
+++ b/main_example.py
@ -0,0 +1,108 @@
 import requests
 import json
 import os
 from bs4 import BeautifulSoup
 # Site root; "/ebook.php" is appended to it for every request.
 baseurl = "https://bvtlab.com"
 # Hard-coded session cookie value, sent as the PHPSESSID cookie.
 phpsessid = "lps4od32b4kgibtgd1440df7i5"
 # Sent as the "sid" query parameter of every ebook.php request.
 sid = "b88e922dd251e1f6"
 # NOTE(review): not read anywhere in the visible code -- confirm it is still needed.
 time = "6535904988"
 # Cookie mapping handed to requests.get through `cookies=`.
 reqcookies = {"PHPSESSID": phpsessid}
 def getImageName(imageURL):
    """Return the part of imageURL that follows its last "/".

    If imageURL contains no "/", the whole string is returned; if it ends
    with "/", the result is the empty string.
    """
    # One split from the right replaces the character-by-character
    # accumulator, which rebuilt the result string on every character.
    return imageURL.rsplit("/", 1)[-1]
 def downloadImage(imageURL, outDir, outFile):
    """Download imageURL and write the response body to outDir/outFile.

    outDir is created (including missing parents) when it does not exist.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(imageURL)
    # Fail before touching the disk so an HTML error page is never saved
    # under an image file name.
    response.raise_for_status()
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(outDir, exist_ok=True)
    with open(os.path.join(outDir, outFile), 'wb') as tempFile:
        tempFile.write(response.content)
 def getSegments():
    """Fetch the segment list and keep the entries whose "Section" is null.

    Sends action=getsegments to ebook.php with the module-level sid and
    cookies.

    Returns:
        A list of 7-item lists, one per kept entry:
        [position, Level1, Segment_Depth, Segment_Level, Line_Item,
        Short_Title, Title], where position is the entry's 1-based index in
        the full response (assumes the response is a JSON array -- TODO
        confirm).
    """
    payload = {
        "action": "getsegments",
        "sid": sid,
    }
    r = requests.get(baseurl + "/ebook.php", params=payload, cookies=reqcookies)
    result = []
    # Positions are counted over the whole response, not just the kept rows.
    for position, item in enumerate(r.json(), start=1):
        if item["Section"] is None:
            result.append([
                position,
                item["Level1"],
                item["Segment_Depth"],
                item["Segment_Level"],
                item["Line_Item"],
                item["Short_Title"],
                item["Title"],
            ])
    return result
 def getPages(seg, lev, seg_dep, seg_lev, li):
    """Request the pages of one segment and return every <div> in the reply.

    The five arguments are forwarded as the segment, level, segment_depth,
    segment_level and line_item query parameters of an action=getpages
    request; width is fixed at 550.
    """
    query = dict(
        action="getpages",
        sid=sid,
        segment=seg,
        level=lev,
        segment_depth=seg_dep,
        segment_level=seg_lev,
        line_item=li,
        width=550,
    )
    response = requests.get(baseurl + "/ebook.php", params=query, cookies=reqcookies)
    markup = BeautifulSoup(response.text, 'html.parser')
    return markup.find_all("div")
 def ripPages(divlist, dir, pdf):
    """Download each page's image strips, stack them, and add the page to pdf.

    For every element of divlist, the strips referenced by its <img> tags
    are saved under "<dir>/page_<n>/", pasted one below the other onto a
    fixed 1100x1491 canvas, written to "<dir>/page_<n>.jpg", and placed at
    (0, 0) on a newly added page of pdf.

    NOTE(review): Image is not imported in the visible part of this file --
    confirm it is brought into scope elsewhere.
    """
    for page_div in divlist:
        pagenum = page_div["data-page"]
        strip_tags = page_div.find_all("img")
        print(pagenum, "----", page_div)
        width, height = 1100, 1491
        stichedIm = Image.new('RGB', (width, height))
        page_dir = dir + "/page_" + pagenum
        for row, tag in enumerate(strip_tags):
            print(tag)
            strip_url = tag["data-src"]
            strip_file = tag["id"] + ".jpg"
            downloadImage(strip_url, page_dir, strip_file)
            strip = Image.open(os.path.join(page_dir, strip_file))
            print(strip.size)
            # Vertical offset is this strip's own height times its row index.
            stichedIm.paste(strip, (0, strip.size[1] * row))
        page_jpg = page_dir + ".jpg"
        stichedIm.save(page_jpg)
        pdf.add_page()
        pdf.image(page_jpg, 0, 0)
        print("the height:", width, height)
 # Driver: fetch the segment list, rip every segment into one PDF whose page
 # size matches the stitched images, then write it to ebook.pdf.
 # NOTE(review): FPDF is not imported in the visible part of this file --
 # confirm it is brought into scope elsewhere.
 chapters = getSegments()
 pdf = FPDF(unit = "pt", format = [1100, 1491])
 for chapter in chapters:
    # The first five fields are the getPages arguments; the last two
    # (Short_Title, Title) name the output directory.
    s = getPages(*chapter[:5])
    outdir = chapter[5] + "_" + chapter[6]
    ripPages(s, outdir, pdf)
 pdf.output("ebook.pdf", "F")
--- a/pir_connector.py
+++ b/pir_connector.py
@ -0,0 +1,81 @@
 import requests
 import json
 import os
 import re
 import time
 from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support.expected_conditions import presence_of_element_located
 from selenium.webdriver.firefox.options import Options
 # Firefox options shared by every browser session; headless is switched on.
 options = Options()
 options.headless = True
 # Site root for the music pages (not referenced by the visible functions).
 base_url = "https://www.partnersinrhyme.com/royaltyfreemusic"
 # Prefix prepended to each track's data-mp3 value to form its download URL.
 base_files_url = "https://www.partnersinrhyme.com/files/"
 # Local directory prefix under which downloaded songs are stored.
 base_out_url = "files/PartnersInRhyme/"
 def getCategory(category_url):
    """Download every collection linked from a category page.

    The category name is the last path segment of category_url. Every
    distinct href on the page that contains "<category name>/" is printed
    and passed to getCollection together with the category name.
    """
    r = requests.get(category_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Drop trailing slashes first: otherwise the last segment is empty and
    # the pattern below degrades to "/", which matches almost every link.
    category_name = category_url.rstrip("/").split("/")[-1]
    # Escape the name so characters such as "." or "+" match literally.
    link_pattern = re.compile(re.escape(category_name) + "/")
    links = {tag['href'] for tag in soup.find_all(href=link_pattern)}
    # Sorted so repeated runs visit the collections in the same order.
    for lk in sorted(links):
        print(lk)
        getCollection(lk, category_name)
 def getCollection(collection_url, category_name):
    """Download every track listed by a collection page's embedded player.

    The collection page's first <iframe> points at the player. The player is
    rendered in headless Firefox, and each <li> carrying a data-mp3
    attribute is handed to downloadSong. An <li> without that attribute is
    reported and skipped.
    """
    r = requests.get(collection_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Assumes the iframe src is protocol-relative ("//host/...") -- TODO confirm.
    player = soup.iframe["src"]
    # The with-block quits the browser on exit (also when the wait times
    # out), so no explicit quit() is needed and the browser is not kept
    # open while the songs download.
    with webdriver.Firefox(options=options) as driver:
        driver.get("http:" + player)
        WebDriverWait(driver, 60).until(f)
        # Extra pause after the container appears; presumably lets the
        # track list finish rendering.
        time.sleep(2)
        psoup = BeautifulSoup(driver.page_source, 'html.parser')
    collection_name = collection_url.split("/")[-1]
    for li in psoup.find_all("li"):
        # Look the attribute up explicitly so that a KeyError raised inside
        # downloadSong is not mistaken for a missing data-mp3.
        mp3_path = li.attrs.get('data-mp3')
        if mp3_path is None:
            print("Could not load", collection_name, "possibly has hiearchy")
            continue
        print("downloading...", mp3_path)
        downloadSong(base_files_url + mp3_path, category_name)
 def downloadSong(song_url, category_name):
    """Save song_url under base_out_url/<category>/<collection>/ unless present.

    The collection name and file name are derived from song_url by
    getSongFromURL. An existing file is left untouched. A response with an
    error status is reported and nothing is written.
    """
    (collection_name, outFile) = getSongFromURL(song_url)
    outDir = base_out_url + category_name + "/" + collection_name
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(outDir, exist_ok=True)
    target = os.path.join(outDir, outFile)
    if os.path.isfile(target):
        print("File", outFile, "already exists... skipping")
    else:
        response = requests.get(song_url)
        if response.ok:
            with open(target, 'wb') as tempFile:
                tempFile.write(response.content)
        else:
            # Saving an error page would leave a bogus file that the
            # existence check above then skips on every later run.
            print("Could not download", song_url, "- HTTP status", response.status_code)
    print("")
 def getSongFromURL(song_url):
    """Split a song URL into (collection name, file name).

    The URL is split on "/". The collection name is the piece at index 4,
    which for "https://host/files/<collection>/..." is the segment right
    after "files". The file name is the last piece.

    Raises:
        IndexError: if the URL has fewer than five "/"-separated pieces.
    """
    # Named `parts` rather than `list` so the builtin is not shadowed.
    parts = song_url.split("/")
    return (parts[4], parts[-1])
 def f(d):
    """Return the element with class "listContainer" found through d.

    Used as the condition passed to WebDriverWait.until in getCollection,
    where d is the webdriver instance.
    """
    # The By-locator form replaces find_element_by_class_name, which is
    # deprecated and no longer available in Selenium 4.
    return d.find_element(By.CLASS_NAME, "listContainer")
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,8 @@
 beautifulsoup4==4.7.1
 certifi==2018.11.29
 chardet==3.0.4
 idna==2.8
 requests==2.21.0
 selenium==3.141.0
 soupsieve==1.7.1
 urllib3==1.24.1
Author	SHA1	Message	Date
Alexander Matson	4bebd4b7b4	download all collections in category	2019-01-14 13:31:14 -05:00
Alexander Matson	c2a9ae0ca2	delete stuff	2019-01-14 12:48:03 -05:00
Alexander Matson	e45f0129c3	don't download if file is already there	2019-01-14 12:47:03 -05:00
Alexander Matson	0f77e6a69a	working album fetching	2019-01-14 12:34:19 -05:00
Alexander Matson	dafaeb32a2	add firefox driver	2019-01-14 11:28:58 -05:00
Alexander Matson	f2517e4a1d	add selenium	2019-01-14 11:28:29 -05:00
Alexander Matson	17803eb993	ignore cache	2019-01-14 11:03:59 -05:00
Alexander Matson	534e328296	create song downloader	2019-01-14 11:03:40 -05:00
Alexander Matson	9778ad38c2	add requirements	2019-01-14 11:03:30 -05:00
Alexander Matson	672e708d44	add project	2019-01-14 10:47:15 -05:00