Skip to content

Commit

Permalink
GH-4 updated sample python plotting tools
Browse files Browse the repository at this point in the history
  • Loading branch information
bkhanUVA committed Sep 19, 2021
1 parent f09fc10 commit b6a4682
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 11 deletions.
21 changes: 17 additions & 4 deletions python/analysis/get_basic_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,10 @@ def _calculate_average_rating_total(imdb_wide_df: pd.DataFrame):


# Build the nested dict of rating statistics (per-genre averages and overall average).
def _generate_statistics(type: str, imdb_long_df: pd.DataFrame, imdb_wide_df: pd.DataFrame) -> Dict[str, Union[str, Dict[str, int]]]:
    """Build the rating statistics for one data source.

    Args:
        type: Prefix applied to both keys of the returned dict, e.g.
            ``'imdb'`` or ``'user'``. The name shadows the builtin ``type``;
            it is kept so existing callers keep working.
        imdb_long_df: Frame handed to ``_calculate_average_rating_by_genre``.
        imdb_wide_df: Frame handed to ``_calculate_average_rating_total``.

    Returns:
        A dict with exactly two entries:
        ``"<type>_average_rating_genre"`` and ``"<type>_average_rating_total"``.
    """
    return {
        f"{type}_average_rating_genre": _calculate_average_rating_by_genre(imdb_long_df),
        f"{type}_average_rating_total": _calculate_average_rating_total(imdb_wide_df),
    }

Expand All @@ -99,7 +99,20 @@ def output_json(out_path: str, out_filename: str, stats_dict: Dict[str, Union[st
def main():
    """Compute imdb and placeholder user statistics and write them as JSON.

    The combined dict holds the ``imdb_*`` and ``user_*`` entries produced by
    ``_generate_statistics``. It is printed and then passed to ``output_json``
    under the name ``'full_imdb_stats'``.
    """
    env = _read_args(sys.argv[1:])
    imdb_long_df, imdb_wide_df = _import_cleaned_ratings_data(env)
    imdb_stats_dict = _generate_statistics('imdb', imdb_long_df, imdb_wide_df)

    # Just re-use imdb dfs for now so we can test the plotting service.
    user_stats_dict = _generate_statistics('user', imdb_long_df, imdb_wide_df)

    # TEMP - subtract 1 from every user score since the user averages are
    # currently built from the imdb sample; the offset makes the two series
    # distinguishable when testing plots.
    genre_key = 'user_average_rating_genre'
    user_stats_dict = {
        key: (
            {genre: rating - 1 for genre, rating in value.items()}
            if key == genre_key
            else value - 1
        )
        for key, value in user_stats_dict.items()
    }

    imdb_stats_dict.update(user_stats_dict)
    print(imdb_stats_dict)
    output_json(env.outPath, 'full_imdb_stats', imdb_stats_dict)

if __name__ == "__main__":
Expand Down
46 changes: 39 additions & 7 deletions python/analysis/plotting_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,39 +30,69 @@ def import_json(input_path: str) -> Dict[str, Union[str, Dict[str, int]]]:
return json.loads(f.read())


def _genre_ratings_frame(ratings: Dict[str, int], source: str) -> pd.DataFrame:
    """Turn one ``{genre: rating}`` mapping into a frame tagged with *source*."""
    frame = pd.DataFrame.from_dict(ratings, orient='index', columns=['rating'])
    # The genres arrive as the index; expose them as a regular column.
    frame = frame.reset_index().rename(columns={'index': 'genre'})
    frame['type'] = source
    return frame


def _parse_json(
    imdb_stats_dict: Dict[str, Union[str, Dict[str, int]]]
) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """Split the loaded statistics dict into plot-ready pieces.

    Args:
        imdb_stats_dict: Mapping that contains, for each of the sources
            ``'imdb'`` and ``'user'``, the keys
            ``"<source>_average_rating_genre"`` (a ``{genre: rating}``
            mapping) and ``"<source>_average_rating_total"``.

    Returns:
        A tuple ``(genre_ratings_df, average_ratings)``.
        ``genre_ratings_df`` is a long-form frame with columns ``genre``,
        ``rating`` and ``type`` (imdb rows first, then user rows).
        ``average_ratings`` maps each ``"<source>_average_rating_total"`` key
        to its value.

    Raises:
        KeyError: If any of the four expected keys is missing.
    """
    sources = ('imdb', 'user')

    # ignore_index gives the stacked frame unique row labels; without it the
    # imdb and user rows would share the labels 0..n-1.
    genre_ratings_df = pd.concat(
        [
            _genre_ratings_frame(
                imdb_stats_dict[f'{source}_average_rating_genre'], source
            )
            for source in sources
        ],
        ignore_index=True,
    )
    average_ratings = {
        f'{source}_average_rating_total':
            imdb_stats_dict[f'{source}_average_rating_total']
        for source in sources
    }
    return genre_ratings_df, average_ratings


# Plot 1 - Average Rating Over Time

def plot_avg_rating_by_genre(
    genre_ratings_df: pd.DataFrame
) -> plt.Axes:
    """Plot 2 - draw a bar chart of the imdb average rating per genre.

    Args:
        genre_ratings_df: Long-form frame with ``genre``, ``rating`` and
            ``type`` columns; only rows whose ``type`` is ``'imdb'`` are
            plotted.

    Returns:
        The axes object returned by ``sns.barplot``.
    """
    imdb_ratings_df = genre_ratings_df.loc[genre_ratings_df['type'] == 'imdb']
    sns.set(style='ticks')
    ax = sns.barplot(data=imdb_ratings_df, x='genre', y='rating', color='darkblue')
    ax.set_ylim([1, 10])
    # Tilt the genre names so neighbouring labels do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    # Label the returned axes directly instead of relying on pyplot's
    # implicit "current axes".
    ax.set_xlabel('Genres', fontweight='heavy')
    ax.set_ylabel('Ratings', fontweight='heavy')
    return ax

# Plot 3 - Ratings Distribution

# Plot 4 - Average Rating by Runtime

# Plot 5 - User Ratings by Genre vs imdb (all users)
def plot_user_rating_vs_imdb_by_genre(
    genre_ratings_df: pd.DataFrame, average_ratings: Dict[str, float]
) -> plt.axes:
    """Plot 5 - grouped bar chart of user vs imdb average rating per genre.

    Args:
        genre_ratings_df: Long-form frame with ``genre``, ``rating`` and
            ``type`` columns; ``type`` selects the bar colour (hue).
        average_ratings: Must contain ``'imdb_average_rating_total'`` and
            ``'user_average_rating_total'``; both values are shown in the
            plot title.

    Returns:
        The ``plt`` module itself (not an axes object, despite the
        annotation), so callers can call ``.savefig(...)`` on the result.
        NOTE(review): the annotation is left untouched for interface
        stability - confirm whether the grid should be returned instead.
    """
    sns.set(style='ticks')
    # catplot is figure-level: it returns a grid object, not a single axes.
    grid = sns.catplot(data=genre_ratings_df, kind='bar', x='genre', y='rating', hue='type')
    grid.set(ylim=[1, 10])
    # Tilt the genre names so they stay readable, matching the per-genre plot.
    grid.set_xticklabels(rotation=40, ha="right")
    grid.set(title = f"IMDB Average Rating across all genres is {average_ratings['imdb_average_rating_total']}, while User Average is {average_ratings['user_average_rating_total']}")
    plt.xlabel('Genres', fontweight='heavy')
    plt.ylabel('Ratings', fontweight='heavy')
    return plt

# Plot 6 - User Average Rating vs imdb (all users)

Expand All @@ -71,8 +101,10 @@ def main():
# Temporary main function - delete this later
json_file = os.path.expanduser('~/Desktop/full_imdb_stats.json')
test_dict = import_json(json_file)
imdb_genre_ratings_df, imdb_average_rating = _parse_json(test_dict)
imdb_genre_ratings_df, average_ratings = _parse_json(test_dict)
plot_avg_rating_by_genre(imdb_genre_ratings_df)
test_plt = plot_user_rating_vs_imdb_by_genre(imdb_genre_ratings_df, average_ratings)
test_plt.savefig('test_plt.png')



Expand Down

0 comments on commit b6a4682

Please sign in to comment.