"""Scripts for downloading data from IMDb."""

import os
import pickle
import requests
import time
import csv

from bs4 import BeautifulSoup
import numpy as np

class IMDbDownloader:
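  """Downloads per-title award and rating data from IMDb and exports it to CSV."""
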
  def download_award_data(self, url):
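    """Download and parse the awards page for one title.

    Returns a list of ceremonies, each with its year and list of awards,
    or None if the page could not be fetched after repeated failures.
    """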
    awards = list()
    req = requests.get(url)

    failures = 0
    while req.status_code != 200:
      failures += 1
      print("Error: " + str(req.status_code))
      if failures > 3:
        return None
      time.sleep(1)  # brief pause before retrying
      req = requests.get(url)

    soup = BeautifulSoup(req.text, 'html.parser')
    # Each ceremony appears as an <h3> heading followed by a <table> of awards.
    body = soup.find('div', id='main').find('div', class_='article listo')
    titles = [t for t in body if t.name == 'h3']
    tables = [t for t in body if t.name == 'table']

    for title, table in zip(titles, tables):
      heading = title.text.strip().split('\n')
      ceremony = heading[0]
      year = int(heading[1])

      rows = [r for r in table if r.name == 'tr']
      awards_list = list()
      outcome = category = None
      for row in rows:
        cols = row.find_all('td')
        if not cols:
          continue
        # The outcome/category cell spans several rows, so reuse the last
        # seen values when the current row omits it.
        if len(cols) > 1:
          outcome, category = cols[0].text.strip().split('\n')
        info = cols[-1].text.strip().split('\n')
        award_name = info[0]
        recipients = [name.strip() for name in info[1:]]
        awards_list.append({'outcome': outcome, 'category': category,
                            'description': award_name, 'recipients': recipients})
      awards.append({'ceremony_name': ceremony, 'year': year, 'awards': awards_list})

    return awards

  def download_rating_data(self, url):
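    """Download and parse the ratings page for one title.

    Returns a dict with the vote histogram and the rating broken down by
    demographic, or None if the page could not be fetched after repeated
    failures.
    """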
    req = requests.get(url)

    failures = 0
    while req.status_code != 200:
      failures += 1
      print("Error: " + str(req.status_code))
      if failures > 3:
        return None
      time.sleep(1)  # brief pause before retrying
      req = requests.get(url)

    soup = BeautifulSoup(req.text, 'html.parser')
    body = soup.find('div', id='main').find('div', class_='title-ratings-sub-page')
    # The sub-page holds three tables: the vote histogram, the breakdown by
    # demographic, and one further breakdown table.
    table1, table2, table3 = [t for t in body if t.name == 'table']

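    # Table 1: histogram of votes per score (1 to 10).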
    votes_by_rating = dict()
    rows1 = [r for r in table1 if r.name == 'tr']
    for row in rows1[1:]:
      cols = row.find_all('td')
      rating = int(cols[0].text.strip())
      votes = int(cols[2].text.strip().replace(',', ''))
      votes_by_rating[rating] = votes

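    # Table 2: average rating and vote count by gender and age group.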
    rating_by_demographic = dict()
    rows2 = [r for r in table2 if r.name == 'tr']
    headers = [h.text.strip() for h in rows2[0].find_all('th')[1:]]
    for row in rows2[1:]:
      cols = [c for c in row if c.name == 'td']
      gender = cols[0].text.strip()
      rating_by_demographic[gender] = dict()
      for col, header in zip(cols[1:], headers):
        info = col.text.strip().split('\n')
        try:
          rating = float(info[0].strip())
          votes = int(info[-1].strip().replace(',', ''))
        except (ValueError, IndexError):
          rating = None
          votes = 0
        rating_by_demographic[gender][header] = {'rating': rating, 'votes': votes}

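    # Table 3: a further breakdown, stored directly under each column header.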
    rows3 = [r for r in table3 if r.name == 'tr']
    headers = [h.text.strip() for h in rows3[0].find_all('th')]
    for row in rows3[1:]:
      cols = [c for c in row if c.name == 'td']
      for col, header in zip(cols, headers):
        info = col.text.strip().split('\n')
        try:
          rating = float(info[0].strip())
          votes = int(info[-1].strip().replace(',', ''))
        except (ValueError, IndexError):
          rating = None
          votes = 0
        rating_by_demographic[header] = {'rating': rating, 'votes': votes}

    ratings = {'votes_by_score': votes_by_rating,
               'rating_by_demographic': rating_by_demographic}

    return ratings

  def download_all(self, start=0):
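    """Download award and rating data for every title in links.csv.

    Results are pickled in batches of `batch_size` titles; `start` selects
    the first batch to process, so an interrupted run can be resumed.
    """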
    path_csv = os.path.abspath('./databases/the-movies-dataset/links.csv')
    with open(path_csv, "rb") as input_file:
      links = np.loadtxt(input_file, delimiter=",", skiprows=1, dtype=str)

    errors = list()
    awards_url_format = 'https://www.imdb.com/title/tt{}/awards'
    ratings_url_format = 'https://www.imdb.com/title/tt{}/ratings'
    skipped = 0
    t0 = time.time()
    batch_size = 1000
    indices = np.arange(0, len(links))
    for i, imdb_id in zip(indices[start*batch_size:], links[start*batch_size:, 1]):
      if i % batch_size == 0:
        batch_number = i // batch_size
        path_db = os.path.abspath('./databases/the-movies-dataset/extra_info{}.pkl'.format(batch_number))
        if os.path.exists(path_db):
          with open(path_db, 'rb') as input_file:
            info = pickle.load(input_file)
        else:
          info = dict()
      if imdb_id in info:
        skipped += 1
        continue
      failures = 0
      while True:
        try:
          awards = self.download_award_data(awards_url_format.format(imdb_id))
          ratings = self.download_rating_data(ratings_url_format.format(imdb_id))
          if awards is None or ratings is None:
            print('None with ' + imdb_id)
          info[imdb_id] = {'ratings': ratings, 'awards': awards}
          break
        except Exception:
          failures += 1
          if failures >= 3:
            print('Error: {}'.format(imdb_id))
            errors.append(imdb_id)
            break
      if (i + 1) % 50 == 0 or (i + 1) % batch_size == 0:
        print('Saving results...')
        with open(path_db, 'wb') as output:
          pickle.dump(info, output, pickle.HIGHEST_PROTOCOL)
      print('Done with {} out of {} movies...'.format(i + 1, len(links)))
      print('Errors: {}'.format(len(errors)))
      print('Average time per movie: {:.3f}s\n'.format(
          (time.time() - t0) / (i - start*batch_size - skipped + 1)))

    with open(path_db, 'wb') as output:
      pickle.dump(info, output, pickle.HIGHEST_PROTOCOL)
    return info, errors

  def export_to_csv(self):
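    """Flatten the pickled batches into three CSV files: awards,
    per-demographic ratings, and per-score vote histograms.
    """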
    path_csv = os.path.abspath('./databases/extra_info/')
    with open(os.path.join(path_csv, 'extra_awards.csv'), 'w', newline='') as awards_csv,\
    open(os.path.join(path_csv, 'extra_ratings_dem.csv'), 'w', newline='') as ratings_dem_csv,\
    open(os.path.join(path_csv, 'extra_ratings_hist.csv'), 'w', newline='') as ratings_hist_csv:
      awards_writer = csv.writer(awards_csv, delimiter=',',
                                 quotechar='"', quoting=csv.QUOTE_MINIMAL)
      ratings_dem_writer = csv.writer(ratings_dem_csv, delimiter=',',
                                      quotechar='"', quoting=csv.QUOTE_MINIMAL)
      ratings_hist_writer = csv.writer(ratings_hist_csv, delimiter=',',
                                       quotechar='"', quoting=csv.QUOTE_MINIMAL)
      awards_writer.writerow(['imdbId', 'ceremony', 'year', 'category', 'outcome', 'description', 'details'])
      ratings_dem_writer.writerow(['imdbId', 'gender', 'age', 'votes', 'rating'])
      ratings_hist_writer.writerow(['imdbId', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
      errors = 0
      # One pickle file was saved per batch of 1000 titles.
      for batch_number in range(46):
        path_db = os.path.abspath('./databases/the-movies-dataset/extra_info{}.pkl'.format(batch_number))
        with open(path_db, 'rb') as input_file:
          data = pickle.load(input_file)
        for imdb_id, info in sorted(data.items(), key=lambda x: x[0]):
          # Awards
          if info['awards'] is not None:
            for ceremony_info in info['awards']:
              for award_info in ceremony_info['awards']:
                try:
                  awards_writer.writerow([imdb_id,
                              ceremony_info['ceremony_name'],
                              ceremony_info['year'],
                              award_info['category'],
                              award_info['outcome'],
                              award_info['description'],
                              award_info['recipients']
                              ])
                except Exception:
                  errors += 1
                  print('Error in {}'.format(imdb_id))
          # Ratings
          if info['ratings'] is not None:
            ratings_dem_info = info['ratings']['rating_by_demographic']
            for gender in ['Males', 'Females', 'All']:
              for age in ['<18', '18-29', '30-44', '45+', 'All Ages']:
                ratings_dem_writer.writerow([imdb_id, gender, age,
                                             ratings_dem_info[gender][age]['votes'],
                                             ratings_dem_info[gender][age]['rating']])
            ratings_hist_info = info['ratings']['votes_by_score']
            hist_row = [imdb_id]
            for _, votes in sorted(ratings_hist_info.items()):
              hist_row.append(votes)
            ratings_hist_writer.writerow(hist_row)
      print('Errors: {}'.format(errors))

if __name__ == "__main__":
  downloader = IMDbDownloader()
  downloader.download_all()
  downloader.export_to_csv()