"""Clean the IMDB top-2000 movie table and cache the result as a CSV file."""

from pathlib import Path

import numpy as np
import pandas as pd

# Rows missing any of these values are dropped at the end of cleaning.
_REQUIRED_COLUMNS = ["Gross", "IMDB Rating", "Metascore", "Release Year", "Votes"]

# The comma-separated "Genre" field is spread over exactly these columns.
_GENRE_COLUMNS = ["Genre 1", "Genre 2", "Genre 3"]


def _strip_to_number(column, keep_decimal=False):
    """Remove non-numeric characters from a string column and parse it.

    Values that are missing, or that contain no digits at all, become NaN
    instead of raising, so they can be filtered out later.
    """
    pattern = r"[^\d.]" if keep_decimal else r"\D"
    cleaned = column.str.replace(pattern, "", regex=True)
    return pd.to_numeric(cleaned.astype(object), errors="coerce")


def categorize(
    input_path="./data/imdb_top_2000_movies.csv",
    output_path="./out/categoricals.csv",
):
    """Load the raw movie CSV, clean it, write it to disk and return it.

    Args:
        input_path: CSV file to read; its first column is used as the index.
        output_path: Where the cleaned table is written as CSV. Missing
            parent directories are created.

    Returns:
        The cleaned ``pandas.DataFrame``. "Director", "Cast", "Release Year"
        and "Genre 1".."Genre 3" are categorical; "Gross" is a float scaled
        by 1e6; "Duration", "Votes" and "Metascore" are integers. The
        original "Genre" column is removed.

    Note:
        The CSV written to ``output_path`` cannot carry categorical dtypes,
        so a frame read back from it has plain columns.
    """
    df = pd.read_csv(input_path, index_col=0)
    print(f"initial number of movies: {len(df.index)}")

    # Work on a copy so the column assignments below modify a real frame
    # rather than a view of the unfiltered one.
    df = df[df["Metascore"].notna()].copy()

    # The 1e6 factor implies the raw text is in millions, so the decimal
    # point has to survive the stripping.
    df["Gross"] = _strip_to_number(df["Gross"], keep_decimal=True) * 1e6
    # Keep only the first four digits left after stripping non-digits.
    df["Release Year"] = (
        df["Release Year"].str.replace(r"\D", "", regex=True).str[:4]
    )

    df["Duration"] = df["Duration"].astype(int)
    # Votes may still hold NaN here; the integer cast happens after the
    # null filter below.
    df["Votes"] = _strip_to_number(df["Votes"])

    df["IMDB Rating"] = df["IMDB Rating"].astype(float)
    df["Metascore"] = df["Metascore"].astype(int)

    for name in ("Director", "Cast", "Release Year"):
        df[name] = pd.Categorical(df[name])

    # n=2 caps the split at three parts and reindex pads to three columns,
    # so the assignment works whatever the widest row looks like.
    genres = (
        df["Genre"]
        .str.split(", ", n=2, expand=True)
        .reindex(columns=range(len(_GENRE_COLUMNS)))
        .astype("category")
    )
    genres.columns = _GENRE_COLUMNS
    for name in _GENRE_COLUMNS:
        df[name] = genres[name]

    df = df.drop(columns=["Genre"])

    complete = df[_REQUIRED_COLUMNS].notna().all(axis=1)
    df = df[complete].astype({"Votes": int})
    print(f"number of movies after cleaning: {len(df.index)}")

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # DataFrame.to_csv writes UTF-8 by default, which is also what
    # pandas.read_csv assumes by default when the cache is read back.
    df.to_csv(output_path)

    return df
b/busi410project/fit.py @@ -1,54 +1,19 @@ +import os, csv import numpy as np import pandas as pd import statsmodels.api as sm import statsmodels.formula.api as smf from patsy import dmatrices +from categorize import categorize -df = pd.read_csv("./data/imdb_top_2000_movies.csv", index_col=0) -print(f"initial number of movies: {len(df.index)}") +if not os.path.exists("./out"): + os.makedirs("./out") - -df = df[df[["Metascore"]].notnull().all(1)] - -df["Gross"] = df["Gross"].str.replace(r"\D", "", regex=True).astype(float) * 1e6 -df["Release Year"] = df["Release Year"].str.replace(r"\D", "", regex=True).str[:4] - -df["Duration"] = df["Duration"].astype(int) -df["Votes"] = df["Votes"].str.replace(r"\D", "", regex=True).astype(int) - -df["IMDB Rating"] = df["IMDB Rating"].astype(float) -df["Metascore"] = df["Metascore"].astype(int) - - -df["Director"] = pd.Categorical(df["Director"]) -df["Cast"] = pd.Categorical(df["Cast"]) -df["Release Year"] = pd.Categorical(df["Release Year"]) - -director_cat = np.array(df["Director"].cat.categories) -actor_cat = np.array(df["Cast"].cat.categories) -release_year_cat = np.array(df["Release Year"].cat.categories) - -df[["Genre 1", "Genre 2", "Genre 3"]] = ( - df["Genre"].str.split(", ", expand=True).astype("category") -) - -genre_cat = np.unique( - np.concatenate( - [ - df["Genre 1"].cat.categories, - df["Genre 2"].cat.categories, - df["Genre 3"].cat.categories, - ] - ) -) - -df.drop(columns=["Genre"], inplace=True) - -df = df[ - df[["Gross", "IMDB Rating", "Metascore", "Release Year", "Votes"]].notnull().all(1) -] -print(f"number of movies after cleaning: {len(df.index)}") +if not os.path.exists("./out/categoricals.csv"): + df = categorize() +else: + df = pd.read_csv("./out/categoricals.csv", index_col=0) target = "Gross" @@ -75,3 +40,6 @@ model = sm.OLS(y, X) results = model.fit() print(results.summary()) + +with open("./out/model_summary.csv", "w") as f: + f.write(results.summary().as_csv())