"""Scripts for preliminary filtering of the data."""

import os
import pickle
import csv
import ast

import numpy as np

class Preprocessor:
  """Preliminary filtering of the-movies-dataset.

  Builds auxiliary files under ./databases/extra_info/:
    * country_frequencies.csv -- production-country occurrence counts
    * keyword_frequencies.csv -- keyword occurrence counts
    * awards_condensed.csv    -- one row per (movie, award ceremony)
    * filtered_movie_data.csv -- metadata joined with rating statistics
  All input/output paths are hard-coded relative to the working directory.
  """

  # Number of pickled extra_info<N>.pkl batches expected on disk.
  NUM_BATCHES = 46

  def __init__(self):
    pass

  def create_additional_files(self):
    """Create every auxiliary CSV file (see class docstring)."""
    path_csv = os.path.abspath('./databases/the-movies-dataset/')
    path_out = os.path.abspath('./databases/extra_info/')
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path_out, exist_ok=True)

    self._write_country_frequencies(path_csv, path_out)
    self._write_keyword_frequencies(path_csv, path_out)
    self._write_awards_condensed(path_out)
    self._write_filtered_movie_data(path_csv, path_out)

  @staticmethod
  def _load_batch(batch_number):
    """Load one pickled extra-info batch; returns a dict imdb_id -> info."""
    path_db = os.path.abspath(
        './databases/the-movies-dataset/extra_info{}.pkl'.format(batch_number))
    with open(path_db, 'rb') as input_file:
      return pickle.load(input_file)

  @staticmethod
  def _count_items(reader, column, id_key, label):
    """Tally the list-of-dict entries stored in `column` of each CSV row.

    Each cell is a Python-literal list of dicts; `id_key` selects the
    identifier field and 'name' the display name.  `label` is used in the
    sanity-check message ('code' / 'id').  Returns (counts, names) dicts
    keyed by the identifier.
    """
    counts = dict()
    names = dict()
    for i, row in enumerate(reader):
      if i == 0:  # header row
        continue
      try:
        items = ast.literal_eval(row[column])
      except (ValueError, SyntaxError, IndexError):
        # Malformed cell: report and skip this row.  (The original code
        # reused the previous row's list here, double-counting its entries.)
        print(row)
        continue
      if not isinstance(items, list):
        continue
      for item in items:
        item_id = item[id_key]
        item_name = item['name']
        if item_id in counts:
          # Sanity check: an identifier must always map to the same name.
          assert names[item_id] == item_name, \
              'Same {} {} for {} and {}'.format(
                  label, item_id, item_name, names[item_id])
          counts[item_id] += 1
        else:
          names[item_id] = item_name
          counts[item_id] = 1
    return counts, names

  @staticmethod
  def _write_counts(writer, counts, names):
    """Write one (id, name, count) row per tallied item.

    The output file uses the platform default encoding, so some names may
    fail to encode; those are reported and skipped (best-effort, as before).
    """
    for item_id, count in counts.items():
      try:
        writer.writerow([item_id, names[item_id], count])
      except UnicodeEncodeError:
        print(names[item_id])

  def _write_country_frequencies(self, path_csv, path_out):
    """Count production countries (column 13 of movies_metadata.csv) and
    write country_frequencies.csv (country_code, country_name, count)."""
    with open(os.path.join(path_out, 'country_frequencies.csv'), 'w',
              newline='') as country_freq, \
         open(os.path.join(path_csv, 'movies_metadata.csv'), 'r', newline='',
              encoding='utf-8') as metadata_csv:
      writer = csv.writer(country_freq, delimiter=',', quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
      reader = csv.reader(metadata_csv, delimiter=',', quotechar='"')
      counts, names = self._count_items(reader, 13, 'iso_3166_1', 'code')
      writer.writerow(['country_code', 'country_name', 'count'])
      self._write_counts(writer, counts, names)

  def _write_keyword_frequencies(self, path_csv, path_out):
    """Count keywords (column 1 of keywords.csv) and write
    keyword_frequencies.csv (keyword_id, keyword, count)."""
    with open(os.path.join(path_out, 'keyword_frequencies.csv'), 'w',
              newline='') as keyword_freq, \
         open(os.path.join(path_csv, 'keywords.csv'), 'r', newline='',
              encoding='utf-8') as keywords_csv:
      writer = csv.writer(keyword_freq, delimiter=',', quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
      reader = csv.reader(keywords_csv, delimiter=',', quotechar='"')
      counts, names = self._count_items(reader, 1, 'id', 'id')
      writer.writerow(['keyword_id', 'keyword', 'count'])
      self._write_counts(writer, counts, names)

  def _write_awards_condensed(self, path_out):
    """Flatten per-ceremony award info from the pickled batches into
    awards_condensed.csv (imdbId, ceremony, year, nominations, wins).
    Prints the number of rows that could not be condensed."""
    with open(os.path.join(path_out, 'awards_condensed.csv'), 'w',
              newline='') as awards_condensed_csv:
      awards_writer = csv.writer(awards_condensed_csv, delimiter=',',
                                 quotechar='"', quoting=csv.QUOTE_MINIMAL)
      errors = 0
      awards_writer.writerow(['imdbId', 'ceremony', 'year', 'nominations',
                              'wins'])
      for batch_number in range(self.NUM_BATCHES):
        data = self._load_batch(batch_number)
        for imdb_id, info in sorted(data.items(), key=lambda x: x[0]):
          if info['awards'] is None:
            continue
          for ceremony_info in info['awards']:
            try:
              nominations = sum(1 for a in ceremony_info['awards']
                                if a['outcome'].lower() == 'nominee')
              wins = sum(1 for a in ceremony_info['awards']
                         if a['outcome'].lower() == 'winner')
              awards_writer.writerow([imdb_id,
                                      ceremony_info['ceremony_name'],
                                      ceremony_info['year'],
                                      nominations,
                                      wins
                                      ])
            except (KeyError, TypeError, AttributeError):
              # Missing/None fields in a ceremony entry: skip it, keep going.
              errors += 1
              print('Error in {}'.format(imdb_id))
      print(errors)

  @staticmethod
  def _weighted_std(values, weights, mean, count):
    """Return sqrt(sum(w * (values - mean)^2) / count) or None on failure.

    `weights` of None means unit weights.  A zero `count` is rejected up
    front: numpy would not raise on the division, it would silently emit
    inf/nan into the output.
    """
    if not count:
      return None
    try:
      deviations = (values - mean) ** 2
      if weights is not None:
        deviations = weights * deviations
      return np.sqrt(np.sum(deviations) / count)
    except TypeError:
      # e.g. mean is None because the demographic rating was missing.
      return None

  def _write_filtered_movie_data(self, path_csv, path_out):
    """Join movies_metadata.csv with the pickled rating statistics and write
    filtered_movie_data.csv (plus tst.csv holding every kept raw value).
    Prints the error count and the count of movies without ratings."""
    # imdb_id -> ratings info, merged over all batches.
    ratings_data = dict()
    for batch_number in range(self.NUM_BATCHES):
      for imdb_id, info in self._load_batch(batch_number).items():
        ratings_data[imdb_id] = info['ratings']

    errors = 0
    no_rating = 0
    with open(os.path.join(path_out, 'filtered_movie_data.csv'), 'w',
              newline='') as filtered_data_csv, \
         open(os.path.join(path_csv, 'movies_metadata.csv'), 'r', newline='',
              encoding='utf-8') as metadata_csv, \
         open(os.path.join(path_out, 'tst.csv'), 'w', newline='') as tstcsv:
      data_writer = csv.writer(filtered_data_csv, delimiter=',',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL)
      tst_writer = csv.writer(tstcsv, delimiter=',',
                              quotechar='"', quoting=csv.QUOTE_MINIMAL)
      metadata_reader = csv.reader(metadata_csv, delimiter=',',
                                   quotechar='"')
      remove_cols = ['imdbId', 'vote_average', 'vote_count', 'poster_path',
                     'homepage']
      headers = None
      imdbId = None  # so the error handler below can always print something
      for i, row in enumerate(metadata_reader):
        try:
          if i == 0:
            headers = row
            filtered_header = (
                ['imdbId']
                + [h for h in headers if h not in remove_cols]
                + ['release_year', 'vote_average', 'vote_count', 'vote_std',
                   'vote_std_by_gender', 'vote_std_by_age'])
            data_writer.writerow(filtered_header)
            continue
          imdbId = row[6][2:]  # drop the 'tt' prefix of the imdb id column
          if imdbId not in ratings_data:
            no_rating += 1
            print(imdbId)
            continue
          filtered_row = [imdbId]
          # Reset per row: the original carried the previous row's year over
          # when a row had no usable release_date column.
          release_year = None
          for j, value in enumerate(row):
            if headers[j] in remove_cols:
              continue
            try:
              tst_writer.writerow([value])
              filtered_row.append(value)
            except (UnicodeEncodeError, csv.Error):
              filtered_row.append(None)
            if headers[j] == 'release_date':
              release_year = int(value[:4])
          filtered_row.append(release_year)

          demographics = ratings_data[imdbId]['rating_by_demographic']
          votes_mean = demographics['All']['All Ages']['rating']
          filtered_row.append(votes_mean)
          votes_count = demographics['All']['All Ages']['votes']
          filtered_row.append(votes_count)

          # Std of the 1..10 vote histogram, weighted by votes per score.
          votes_hist = np.zeros((10,))
          by_score = sorted(ratings_data[imdbId]['votes_by_score'].items(),
                            key=lambda x: x[0])
          for j, (score, votes) in enumerate(by_score):
            votes_hist[j] = votes
          filtered_row.append(self._weighted_std(
              np.arange(1, 11), votes_hist, votes_mean, votes_count))

          # Spread of the mean rating across genders.
          rating_by_gender = np.zeros((2,))
          for j, gender in enumerate(['Males', 'Females']):
            rating_by_gender[j] = \
                demographics[gender]['All Ages']['rating']
          filtered_row.append(self._weighted_std(
              rating_by_gender, None, votes_mean, votes_count))

          # Spread of the mean rating across age groups.
          rating_by_age = np.zeros((4,))
          for j, age in enumerate(['<18', '18-29', '30-44', '45+']):
            rating_by_age[j] = demographics['All'][age]['rating']
          filtered_row.append(self._weighted_std(
              rating_by_age, None, votes_mean, votes_count))

          data_writer.writerow(filtered_row)
        except Exception:
          # Best-effort over dirty rows, but no longer swallows SystemExit
          # or KeyboardInterrupt like the original bare except did.
          errors += 1
          print(imdbId)
          continue
    print(errors)
    print(no_rating)

if __name__ == "__main__":
  # Run the full preprocessing pipeline when executed as a script.
  Preprocessor().create_additional_files()