# Scripts for preliminary filtering of the data
import os
import pickle
import csv
import ast
import numpy as np
class Preprocessor:
    """Preliminary filtering/aggregation of the raw movies dataset.

    Reads ``./databases/the-movies-dataset/`` (CSV metadata plus pickled
    ``extra_info{N}.pkl`` batches) and writes derived summary CSVs into
    ``./databases/extra_info/``: country frequencies, keyword frequencies,
    condensed award counts, and a filtered per-movie data table.
    """

    def __init__(self):
        pass

    def create_additional_files(self, num_batches=46):
        """Generate all derived CSV files from the raw dataset.

        Args:
            num_batches: number of ``extra_info{N}.pkl`` pickle batches to
                read (the original dataset ships 46).

        Side effects: creates ``./databases/extra_info/`` if missing and
        writes four CSV files there; prints diagnostics for rows that
        cannot be parsed.
        """
        path_csv = os.path.abspath('./databases/the-movies-dataset/')
        path_out = os.path.abspath('./databases/extra_info/')
        if not os.path.exists(path_out):
            os.makedirs(path_out)
        self._write_country_frequencies(path_csv, path_out)
        self._write_keyword_frequencies(path_csv, path_out)
        self._write_awards_condensed(path_out, num_batches)
        self._write_filtered_movie_data(path_csv, path_out, num_batches)

    def _write_country_frequencies(self, path_csv, path_out):
        """Count production-country occurrences across movies_metadata.csv."""
        # NOTE: outputs are opened with encoding='utf-8' so non-ASCII
        # country names no longer raise UnicodeEncodeError on write.
        with open(os.path.join(path_out, 'country_frequencies.csv'), 'w',
                  newline='', encoding='utf-8') as country_freq, \
             open(os.path.join(path_csv, 'movies_metadata.csv'), 'r',
                  newline='', encoding='utf-8') as metadata_csv:
            country_writer = csv.writer(country_freq, delimiter=',',
                                        quotechar='"', quoting=csv.QUOTE_MINIMAL)
            metadata_reader = csv.reader(metadata_csv, delimiter=',',
                                         quotechar='"')
            country_count = dict()
            country_codes = dict()
            for i, row in enumerate(metadata_reader):
                if i == 0:
                    continue  # skip header row
                try:
                    # Column 13 holds a Python-literal list of country dicts.
                    country_list = ast.literal_eval(row[13])
                except (ValueError, SyntaxError):
                    # BUG FIX: the original fell through here and reused a
                    # stale country_list from the previous row (NameError on
                    # the very first failure). Skip the malformed row instead.
                    print(row)
                    continue
                if not isinstance(country_list, list):
                    continue
                for item in country_list:
                    country_code = item['iso_3166_1']
                    country_name = item['name']
                    if country_code in country_codes:
                        assert country_codes[country_code] == country_name, \
                            'Same code {} for {} and {}'.format(
                                country_code, country_name,
                                country_codes[country_code])
                        country_count[country_code] += 1
                    else:
                        country_codes[country_code] = country_name
                        country_count[country_code] = 1
            country_writer.writerow(['country_code', 'country_name', 'count'])
            for country_code, count in country_count.items():
                try:
                    country_writer.writerow(
                        [country_code, country_codes[country_code], count])
                except UnicodeEncodeError:
                    print(country_codes[country_code])

    def _write_keyword_frequencies(self, path_csv, path_out):
        """Count keyword occurrences across keywords.csv."""
        with open(os.path.join(path_out, 'keyword_frequencies.csv'), 'w',
                  newline='', encoding='utf-8') as keyword_freq, \
             open(os.path.join(path_csv, 'keywords.csv'), 'r',
                  newline='', encoding='utf-8') as keywords_csv:
            keyword_writer = csv.writer(keyword_freq, delimiter=',',
                                        quotechar='"', quoting=csv.QUOTE_MINIMAL)
            keyword_reader = csv.reader(keywords_csv, delimiter=',',
                                        quotechar='"')
            keyword_count = dict()
            keyword_ids = dict()
            for i, row in enumerate(keyword_reader):
                if i == 0:
                    continue  # skip header row
                # Column 1 holds a Python-literal list of keyword dicts.
                keyword_list = ast.literal_eval(row[1])
                for item in keyword_list:
                    keyword_id = item['id']
                    keyword_name = item['name']
                    if keyword_id in keyword_ids:
                        assert keyword_ids[keyword_id] == keyword_name, \
                            'Same id {} for {} and {}'.format(
                                keyword_id, keyword_name,
                                keyword_ids[keyword_id])
                        keyword_count[keyword_id] += 1
                    else:
                        keyword_ids[keyword_id] = keyword_name
                        keyword_count[keyword_id] = 1
            keyword_writer.writerow(['keyword_id', 'keyword', 'count'])
            for keyword_id, count in keyword_count.items():
                try:
                    keyword_writer.writerow(
                        [keyword_id, keyword_ids[keyword_id], count])
                except UnicodeEncodeError:
                    print(keyword_ids[keyword_id])

    def _write_awards_condensed(self, path_out, num_batches=46):
        """Condense per-ceremony award lists into nomination/win counts."""
        with open(os.path.join(path_out, 'awards_condensed.csv'), 'w',
                  newline='', encoding='utf-8') as awards_condensed_csv:
            awards_writer = csv.writer(awards_condensed_csv, delimiter=',',
                                       quotechar='"', quoting=csv.QUOTE_MINIMAL)
            errors = 0
            awards_writer.writerow(
                ['imdbId', 'ceremony', 'year', 'nominations', 'wins'])
            for batch_number in range(num_batches):
                path_db = os.path.abspath(
                    './databases/the-movies-dataset/extra_info{}.pkl'.format(
                        batch_number))
                with open(path_db, 'rb') as input_file:
                    data = pickle.load(input_file)
                # Sort by imdb id so output order is deterministic.
                for imdb_id, info in sorted(data.items(), key=lambda x: x[0]):
                    if info['awards'] is None:
                        continue
                    for ceremony_info in info['awards']:
                        try:
                            nominations = sum(
                                1 for a in ceremony_info['awards']
                                if a['outcome'].lower() == 'nominee')
                            wins = sum(
                                1 for a in ceremony_info['awards']
                                if a['outcome'].lower() == 'winner')
                            awards_writer.writerow([imdb_id,
                                                    ceremony_info['ceremony_name'],
                                                    ceremony_info['year'],
                                                    nominations,
                                                    wins
                                                    ])
                        except (KeyError, AttributeError, TypeError):
                            errors += 1
                            print('Error in {}'.format(imdb_id))
            print(errors)

    def _write_filtered_movie_data(self, path_csv, path_out, num_batches=46):
        """Join metadata rows with scraped rating statistics.

        For each movie that has rating data, writes the non-removed
        metadata columns plus release_year, mean/count of votes and
        standard deviations overall, by gender, and by age bracket.
        """
        # Collect per-title rating info from all pickle batches first.
        ratings_data = dict()
        for batch_number in range(num_batches):
            path_db = os.path.abspath(
                './databases/the-movies-dataset/extra_info{}.pkl'.format(
                    batch_number))
            with open(path_db, 'rb') as input_file:
                data = pickle.load(input_file)
            for imdb_id, info in data.items():
                ratings_data[imdb_id] = info['ratings']
        errors = 0
        no_rating = 0
        with open(os.path.join(path_out, 'filtered_movie_data.csv'), 'w',
                  newline='', encoding='utf-8') as filtered_data_csv, \
             open(os.path.join(path_csv, 'movies_metadata.csv'), 'r',
                  newline='', encoding='utf-8') as metadata_csv, \
             open(os.path.join(path_out, 'tst.csv'), 'w',
                  newline='', encoding='utf-8') as tstcsv:
            data_writer = csv.writer(filtered_data_csv, delimiter=',',
                                     quotechar='"', quoting=csv.QUOTE_MINIMAL)
            tst_writer = csv.writer(tstcsv, delimiter=',',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
            metadata_reader = csv.reader(metadata_csv, delimiter=',',
                                         quotechar='"')
            remove_cols = ['imdbId', 'vote_average', 'vote_count',
                           'poster_path', 'homepage']
            headers = None
            imdbId = None  # BUG FIX: defined before the loop so the outer
            # exception handler never hits a NameError printing it.
            for i, row in enumerate(metadata_reader):
                try:
                    if i == 0:
                        headers = row
                        filtered_header = (
                            ['imdbId']
                            + [h for h in headers if h not in remove_cols]
                            + ['release_year', 'vote_average', 'vote_count',
                               'vote_std', 'vote_std_by_gender',
                               'vote_std_by_age'])
                        data_writer.writerow(filtered_header)
                        continue
                    # Column 6 holds e.g. 'tt0114709'; strip the 'tt' prefix.
                    imdbId = row[6][2:]
                    if imdbId not in ratings_data:
                        no_rating += 1
                        print(imdbId)
                        continue
                    filtered_row = [imdbId]
                    for j, value in enumerate(row):
                        if headers[j] in remove_cols:
                            continue
                        try:
                            tst_writer.writerow([value])
                            filtered_row.append(value)
                        except UnicodeEncodeError:
                            filtered_row.append(None)
                        if headers[j] == 'release_date':
                            # Empty/malformed dates raise ValueError and are
                            # counted by the outer handler, as before.
                            release_year = int(value[:4])
                            filtered_row.append(release_year)
                    demographics = ratings_data[imdbId]['rating_by_demographic']
                    votes_mean = demographics['All']['All Ages']['rating']
                    filtered_row.append(votes_mean)
                    votes_count = demographics['All']['All Ages']['votes']
                    filtered_row.append(votes_count)
                    # Histogram of vote counts per score bucket (1..10).
                    votes_hist = np.zeros((10,))
                    for j, (score, votes) in enumerate(
                            sorted(ratings_data[imdbId]['votes_by_score'].items(),
                                   key=lambda x: x[0])):
                        votes_hist[j] = votes
                    try:
                        votes_std = np.sqrt(
                            np.sum(votes_hist * (np.arange(1, 11) - votes_mean) ** 2)
                            / votes_count)
                    except Exception:
                        votes_std = None
                    filtered_row.append(votes_std)
                    rating_by_gender = np.zeros((2,))
                    for j, gender in enumerate(['Males', 'Females']):
                        rating_by_gender[j] = \
                            demographics[gender]['All Ages']['rating']
                    try:
                        votes_std_by_gender = np.sqrt(
                            np.sum((rating_by_gender - votes_mean) ** 2)
                            / votes_count)
                    except Exception:
                        votes_std_by_gender = None
                    filtered_row.append(votes_std_by_gender)
                    rating_by_age = np.zeros((4,))
                    for j, age in enumerate(['<18', '18-29', '30-44', '45+']):
                        rating_by_age[j] = demographics['All'][age]['rating']
                    try:
                        votes_std_by_age = np.sqrt(
                            np.sum((rating_by_age - votes_mean) ** 2)
                            / votes_count)
                    except Exception:
                        votes_std_by_age = None
                    filtered_row.append(votes_std_by_age)
                    data_writer.writerow(filtered_row)
                except Exception:
                    # Best-effort: count and report the row, keep going.
                    errors += 1
                    print(imdbId)
                    continue
        print(errors)
        print(no_rating)
if __name__ == "__main__":
    # Script entry point: run the full preprocessing pipeline.
    Preprocessor().create_additional_files()