"""Scripts for downloading data from IMDb."""
import csv
import os
import pickle
import time

import requests
from bs4 import BeautifulSoup
import numpy as np
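
# NOTE: the paths used below assume the Kaggle "The Movies Dataset" files are
# unpacked under ./databases/the-movies-dataset/, with links.csv holding the
# columns movieId,imdbId,tmdbId (download_all reads the imdbId column).
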
class IMDbDownloader:
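    """Scrapes per-title award and rating data from IMDb and exports it to CSV."""
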
    def download_award_data(self, url):
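        """Scrape the awards page at `url`.

        Returns a list of dicts, one per ceremony, each holding the ceremony
        name, year, and the list of awards given; returns None after repeated
        request failures.
        """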
        awards = list()
        req = requests.get(url)
        failures = 0
        while req.status_code != 200:
            failures += 1
            print("Error: " + str(req.status_code))
            if failures > 3:
                return None
            time.sleep(1)  # brief pause so retries do not hammer the server
            req = requests.get(url)
        soup = BeautifulSoup(req.text, 'html.parser')
        body = soup.find('div', id='main').find('div', class_='article listo')
        titles = [t for t in body if t.name == 'h3']
        tables = [t for t in body if t.name == 'table']
        for title, table in zip(titles, tables):
            # Each <h3> holds the ceremony name on one line and the year on the next.
            title_lines = title.text.strip().split('\n')
            ceremony = title_lines[0]
            year = int(title_lines[1])
            rows = [r for r in table if r.name == 'tr']
            awards_list = list()
            for row in rows:
                cols = row.find_all('td')
                if len(cols) > 1:
                    outcome, category = cols[0].text.strip().split('\n')
                    info = cols[-1].text.strip().split('\n')
                    award_name = info[0]
                    recipients = [name.strip() for name in info[1:]]
                    awards_list.append({'outcome': outcome, 'category': category,
                                        'description': award_name, 'recipients': recipients})
            awards.append({'ceremony_name': ceremony, 'year': year, 'awards': awards_list})
        return awards
    def download_rating_data(self, url):
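        """Scrape the ratings page at `url`.

        Returns a dict with the vote histogram ('votes_by_score') and the
        per-demographic breakdown ('rating_by_demographic'); returns None
        after repeated request failures.
        """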
        req = requests.get(url)
        failures = 0
        while req.status_code != 200:
            failures += 1
            print("Error: " + str(req.status_code))
            if failures > 3:
                return None
            time.sleep(1)  # brief pause so retries do not hammer the server
            req = requests.get(url)
        soup = BeautifulSoup(req.text, 'html.parser')
        body = soup.find('div', id='main').find('div', class_='title-ratings-sub-page')
        table1, table2, table3 = [t for t in body if t.name == 'table']
        # Table 1: vote histogram (number of votes for each score 1-10).
        votes_by_rating = dict()
        rows1 = [r for r in table1 if r.name == 'tr']
        for row in rows1[1:]:
            cols = row.find_all('td')
            rating = int(cols[0].text.strip())
            votes = int(cols[2].text.strip().replace(',', ''))
            votes_by_rating[rating] = votes
        # Table 2: average rating and vote count per gender/age bucket.
        rating_by_demographic = dict()
        rows2 = [r for r in table2 if r.name == 'tr']
        headers = [h.text.strip() for h in rows2[0].find_all('th')[1:]]
        for row in rows2[1:]:
            cols = [c for c in row if c.name == 'td']
            gender = cols[0].text.strip()
            rating_by_demographic[gender] = dict()
            for col, header in zip(cols[1:], headers):
                info = col.text.strip().split('\n')
                try:
                    ranking = float(info[0].strip())
                    votes = int(info[-1].strip().replace(',', ''))
                except (ValueError, IndexError):
                    # Empty cell: no votes in this demographic bucket.
                    ranking = None
                    votes = 0
                rating_by_demographic[gender][header] = {'rating': ranking, 'votes': votes}
        # Table 3: extra breakdowns stored directly under their column header.
        rows3 = [r for r in table3 if r.name == 'tr']
        headers = [h.text.strip() for h in rows3[0].find_all('th')]
        for row in rows3:
            # The header row has no <td> cells, so it contributes nothing here.
            cols = [c for c in row if c.name == 'td']
            for col, header in zip(cols, headers):
                info = col.text.strip().split('\n')
                try:
                    ranking = float(info[0].strip())
                    votes = int(info[-1].strip().replace(',', ''))
                except (ValueError, IndexError):
                    ranking = None
                    votes = 0
                rating_by_demographic[header] = {'rating': ranking, 'votes': votes}
        ratings = {'votes_by_score': votes_by_rating,
                   'rating_by_demographic': rating_by_demographic}
        return ratings
    def download_all(self, start=0):
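        """Download award and rating data for every title in links.csv.

        Results are pickled in batches of 1000 titles; `start` is the batch
        index to resume from. Returns the current batch's data and the list
        of IMDb ids that failed.
        """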
        path_csv = os.path.abspath('./databases/the-movies-dataset/links.csv')
        with open(path_csv, "rb") as input_file:
            links = np.loadtxt(input_file, delimiter=",", skiprows=1, dtype=str)
        errors = list()
        awards_url_format = 'https://www.imdb.com/title/tt{}/awards'
        ratings_url_format = 'https://www.imdb.com/title/tt{}/ratings'
        skipped = 0
        t0 = time.time()
        batch_size = 1000
        indices = np.arange(0, len(links))
        for i, imdb_id in zip(indices[start*batch_size:], links[start*batch_size:, 1]):
            if i % batch_size == 0:
                # Starting a new batch: reload any partial results saved earlier.
                batch_number = i // batch_size
                path_db = os.path.abspath('./databases/the-movies-dataset/extra_info{}.pkl'.format(batch_number))
                if os.path.exists(path_db):
                    with open(path_db, 'rb') as input_file:
                        info = pickle.load(input_file)
                else:
                    info = dict()
            if imdb_id in info:
                skipped += 1
                continue
            failures = 0
            while True:
                try:
                    awards = self.download_award_data(awards_url_format.format(imdb_id))
                    ratings = self.download_rating_data(ratings_url_format.format(imdb_id))
                    if awards is None or ratings is None:
                        print('None with ' + imdb_id)
                    # Store even partial results so the title is not retried later.
                    info[imdb_id] = {'ratings': ratings, 'awards': awards}
                    break
                except Exception:  # scraping can fail in many ways; retry a few times
                    failures += 1
                    if failures >= 3:
                        print('Error: {}'.format(imdb_id))
                        errors.append(imdb_id)
                        break
            if (i + 1) % 50 == 0 or (i + 1) % batch_size == 0:
                print('Saving results...')
                with open(path_db, 'wb') as output:
                    pickle.dump(info, output, pickle.HIGHEST_PROTOCOL)
                print('Done with {} out of {} movies...'.format(i + 1, len(links)))
                print('Errors: {}'.format(len(errors)))
                print('Average time per movie: {:.3f}s\n'.format(
                    (time.time() - t0) / (i - start*batch_size - skipped + 1)))
        with open(path_db, 'wb') as output:
            pickle.dump(info, output, pickle.HIGHEST_PROTOCOL)
        return info, errors
    def export_to_csv(self):
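        """Flatten the pickled batches into three CSV files.

        Writes one awards file, one ratings-by-demographic file, and one
        rating-histogram file under ./databases/extra_info/.
        """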
        path_csv = os.path.abspath('./databases/extra_info/')
        with open(os.path.join(path_csv, 'extra_awards.csv'), 'w', newline='') as awards_csv,\
                open(os.path.join(path_csv, 'extra_ratings_dem.csv'), 'w', newline='') as ratings_dem_csv,\
                open(os.path.join(path_csv, 'extra_ratings_hist.csv'), 'w', newline='') as ratings_hist_csv:
            awards_writer = csv.writer(awards_csv, delimiter=',',
                                       quotechar='"', quoting=csv.QUOTE_MINIMAL)
            ratings_dem_writer = csv.writer(ratings_dem_csv, delimiter=',',
                                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
            ratings_hist_writer = csv.writer(ratings_hist_csv, delimiter=',',
                                             quotechar='"', quoting=csv.QUOTE_MINIMAL)
            awards_writer.writerow(['imdbId', 'ceremony', 'year', 'category', 'outcome', 'description', 'details'])
            ratings_dem_writer.writerow(['imdbId', 'gender', 'age', 'votes', 'rating'])
            ratings_hist_writer.writerow(['imdbId', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            errors = 0
            for batch_number in range(46):
                path_db = os.path.abspath('./databases/the-movies-dataset/extra_info{}.pkl'.format(batch_number))
                with open(path_db, 'rb') as input_file:
                    data = pickle.load(input_file)
                for imdb_id, info in sorted(data.items(), key=lambda x: x[0]):
                    # Awards: one row per award per ceremony.
                    if info['awards'] is not None:
                        for ceremony_info in info['awards']:
                            for award_info in ceremony_info['awards']:
                                try:
                                    awards_writer.writerow([imdb_id,
                                                            ceremony_info['ceremony_name'],
                                                            ceremony_info['year'],
                                                            award_info['category'],
                                                            award_info['outcome'],
                                                            award_info['description'],
                                                            award_info['recipients']
                                                            ])
                                except Exception:  # skip rows whose fields cannot be written
                                    errors += 1
                                    print('Error in {}'.format(imdb_id))
                    # Ratings: one row per gender/age bucket, plus one histogram row.
                    if info['ratings'] is not None:
                        ratings_dem_info = info['ratings']['rating_by_demographic']
                        for gender in ['Males', 'Females', 'All']:
                            for age in ['<18', '18-29', '30-44', '45+', 'All Ages']:
                                ratings_dem_writer.writerow([imdb_id,
                                                             gender,
                                                             age,
                                                             ratings_dem_info[gender][age]['votes'],
                                                             ratings_dem_info[gender][age]['rating']
                                                             ])
                        ratings_hist_info = info['ratings']['votes_by_score']
                        hist_row = [imdb_id]
                        for score, votes in sorted(ratings_hist_info.items(), key=lambda x: x[0]):
                            hist_row.append(votes)
                        ratings_hist_writer.writerow(hist_row)
        print('{} award rows could not be written'.format(errors))
if __name__ == "__main__":
    downloader = IMDbDownloader()
    # download_all(start=n) resumes from batch n if a previous run was interrupted.
    downloader.download_all()
    downloader.export_to_csv()
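
# A quick way to sanity-check one downloaded batch (a sketch; the id shown is
# hypothetical and must actually exist in the batch):
#
#     with open('./databases/the-movies-dataset/extra_info0.pkl', 'rb') as f:
#         batch = pickle.load(f)
#     print(batch['0114709']['ratings']['votes_by_score'])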