Compare commits


10 Commits

6 changed files with 207 additions and 0 deletions

4  .gitignore vendored

@@ -1 +1,5 @@
env/
files/
__pycache__/
*.tar.gz
*.log

BIN  geckodriver Executable file

Binary file not shown.

6  main.py Normal file

@@ -0,0 +1,6 @@
from pir_connector import *
# getCollection("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music/happymusic")
# getCollection("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music/Corporate-Grooves-Vol-2")
getCategory("https://www.partnersinrhyme.com/royaltyfreemusic/Corporate-Music-and-Motivational-Music")

108  main_example.py Normal file

@@ -0,0 +1,108 @@
import requests
import json
import os
from bs4 import BeautifulSoup
from PIL import Image  # required for Image.new / Image.open below
from fpdf import FPDF  # required for the FPDF document built below

baseurl = "https://bvtlab.com"
# Hard-coded session and book identifiers for the ebook requests.
phpsessid = "lps4od32b4kgibtgd1440df7i5"
sid = "b88e922dd251e1f6"
time = "6535904988"
reqcookies = dict(PHPSESSID=phpsessid)

def getImageName(imageURL):
    # Return the final path component of the URL (everything after the last "/").
    result = ""
    for char in imageURL:
        result += char
        if char == "/":
            result = ""
    return result

def downloadImage(imageURL, outDir, outFile):
    i = requests.get(imageURL)
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    with open(os.path.join(outDir, outFile), 'wb') as tempFile:
        tempFile.write(i.content)

def getSegments():
    # List the book's segments and keep only the top-level entries (those with no "Section").
    payload = {
        "action": "getsegments",
        "sid": sid,
    }
    r = requests.get(baseurl + "/ebook.php", params=payload, cookies=reqcookies)
    items = r.json()
    result = []
    for i in range(0, len(items)):
        if items[i]["Section"] is None:
            # print(i+1, items[i]["Title"], items[i]["Short_Title"])
            result.append([i+1, items[i]["Level1"], items[i]["Segment_Depth"], items[i]["Segment_Level"], items[i]["Line_Item"], items[i]["Short_Title"], items[i]["Title"]])
    return result

def getPages(seg, lev, seg_dep, seg_lev, li):
    payload = {
        "action": "getpages",
        "sid": sid,
        "segment": seg,
        "level": lev,
        "segment_depth": seg_dep,
        "segment_level": seg_lev,
        "line_item": li,
        "width": 550
    }
    r = requests.get(baseurl + "/ebook.php", params=payload, cookies=reqcookies)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup.find_all("div")

def ripPages(divlist, dir, pdf):
    # Each div is one page made of vertically stacked image tiles; stitch them and append to the PDF.
    for d in divlist:
        pagenum = d["data-page"]
        imgs = d.find_all("img")
        print(pagenum, "----", d)
        width = 1100
        height = 1491
        stichedIm = Image.new('RGB', (width, height))
        ih = 0
        for im in imgs:
            print(im)
            imurl = im["data-src"]
            downloadImage(imurl, dir + "/page_" + pagenum, im["id"] + ".jpg")
            im = Image.open(os.path.join(dir + "/page_" + pagenum, im["id"] + ".jpg"))
            print(im.size)
            # height += im.size[1]
            stichedIm.paste(im, (0, im.size[1] * ih))
            ih += 1
        stichedIm.save(dir + "/page_" + pagenum + ".jpg")
        pdf.add_page()
        pdf.image(dir + "/page_" + pagenum + ".jpg", 0, 0)
        print("the height:", width, height)

chapters = getSegments()
pdf = FPDF(unit="pt", format=[1100, 1491])
for x in range(0, len(chapters)):
    s = getPages(chapters[x][0], chapters[x][1], chapters[x][2], chapters[x][3], chapters[x][4])
    outdir = chapters[x][5] + "_" + chapters[x][6]
    ripPages(s, outdir, pdf)
pdf.output("ebook.pdf", "F")

81  pir_connector.py Normal file

@@ -0,0 +1,81 @@
import requests
import json
import os
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.firefox.options import Options

options = Options()
options.headless = True

base_url = "https://www.partnersinrhyme.com/royaltyfreemusic"
base_files_url = "https://www.partnersinrhyme.com/files/"
base_out_url = "files/PartnersInRhyme/"

def getCategory(category_url):
    # Find every collection linked from a category page and download each one.
    r = requests.get(category_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    category_name = category_url.split("/")[-1]
    links = set()
    for li in soup.find_all(href=re.compile(category_name + "/")):
        links.add(li['href'])
    for lk in links:
        print(lk)
        getCollection(lk, category_name)

def getCollection(collection_url, category_name):
    # Render the collection's player iframe in headless Firefox, then download each listed track.
    r = requests.get(collection_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    player = soup.iframe["src"]
    p = requests.get("http:" + player)
    with webdriver.Firefox(options=options) as driver:
        driver.get("http:" + player)
        el = WebDriverWait(driver, 60).until(f)  # wait for the track list to be rendered
        time.sleep(2)
        psoup = BeautifulSoup(driver.page_source, 'html.parser')
        # print(psoup)
        driver.quit()
    for li in psoup.find_all("li"):
        try:
            print("downloading...", li.attrs['data-mp3'])
            downloadSong(base_files_url + li.attrs['data-mp3'], category_name)
        except KeyError:
            print("Could not load", collection_url.split("/")[-1], "possibly has hierarchy")

def downloadSong(song_url, category_name):
    (collection_name, outFile) = getSongFromURL(song_url)
    outDir = base_out_url + category_name + "/" + collection_name
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.isfile(os.path.join(outDir, outFile)):
        i = requests.get(song_url)
        with open(os.path.join(outDir, outFile), 'wb') as tempFile:
            tempFile.write(i.content)
    else:
        print("File", outFile, "already exists... skipping")
    print("")

def getSongFromURL(song_url):
    # e.g. https://www.partnersinrhyme.com/files/<collection>/.../<file>.mp3 -> (<collection>, <file>.mp3)
    parts = song_url.split("/")
    return (parts[4], parts[-1])

def f(d):
    # Custom wait condition for WebDriverWait: the player's track list container is present.
    return d.find_element_by_class_name("listContainer")

8  requirements.txt Normal file

@@ -0,0 +1,8 @@
beautifulsoup4==4.7.1
certifi==2018.11.29
chardet==3.0.4
idna==2.8
requests==2.21.0
selenium==3.141.0
soupsieve==1.7.1
urllib3==1.24.1