adding categorical caching

This commit is contained in:
Joey Eamigh
2024-04-18 16:19:01 -04:00
parent 394089513c
commit dcc96e130d
3 changed files with 68 additions and 43 deletions

3
.gitignore vendored
View File

@@ -1 +1,4 @@
.venv .venv
out
.DS_Store
__pycache__

View File

@@ -0,0 +1,54 @@
import numpy as np
import pandas as pd
def categorize() -> pd.DataFrame:
    """Clean the raw IMDB movie table, cache it as CSV, and return it.

    Reads ``./data/imdb_top_2000_movies.csv``, drops rows without a
    Metascore, converts the numeric columns, turns ``Director``, ``Cast``
    and ``Release Year`` into categoricals, splits ``Genre`` into three
    categorical ``Genre N`` columns, drops rows that still miss a required
    value, and writes the result to ``./out/categoricals.csv``.

    Returns:
        The cleaned DataFrame (the same data that was written to the cache).
    """
    import os

    source_path = "./data/imdb_top_2000_movies.csv"
    cache_path = "./out/categoricals.csv"

    df = pd.read_csv(source_path, index_col=0)
    print(f"initial number of movies: {len(df.index)}")

    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of the raw table (avoids SettingWithCopyWarning).
    df = df[df["Metascore"].notnull()].copy()

    # Keep the decimal point: stripping every non-digit would turn a value
    # such as "$134.97M" into 13497 before scaling to dollars.
    # NOTE(review): assumes Gross is a string expressed in millions — confirm
    # against the data file.
    df["Gross"] = (
        df["Gross"].str.replace(r"[^\d.]", "", regex=True).astype(float) * 1e6
    )
    # Only the first four digits are kept as the year.
    df["Release Year"] = (
        df["Release Year"].str.replace(r"\D", "", regex=True).str[:4]
    )
    df["Duration"] = df["Duration"].astype(int)
    df["Votes"] = df["Votes"].str.replace(r"\D", "", regex=True).astype(int)
    df["IMDB Rating"] = df["IMDB Rating"].astype(float)
    df["Metascore"] = df["Metascore"].astype(int)

    for column in ("Director", "Cast", "Release Year"):
        df[column] = pd.Categorical(df[column])

    # "A, B, C" -> one categorical column per genre slot.
    df[["Genre 1", "Genre 2", "Genre 3"]] = (
        df["Genre"].str.split(", ", expand=True).astype("category")
    )
    df = df.drop(columns=["Genre"])

    required = ["Gross", "IMDB Rating", "Metascore", "Release Year", "Votes"]
    df = df.dropna(subset=required)
    print(f"number of movies after cleaning: {len(df.index)}")

    # Create the output directory here so the function also works when it is
    # called before anything else has created it.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    # Let pandas open the file itself so it controls newline handling.
    df.to_csv(cache_path)
    return df

View File

@@ -1,54 +1,19 @@
import os, csv
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import statsmodels.api as sm import statsmodels.api as sm
import statsmodels.formula.api as smf import statsmodels.formula.api as smf
from patsy import dmatrices from patsy import dmatrices
from categorize import categorize
df = pd.read_csv("./data/imdb_top_2000_movies.csv", index_col=0) if not os.path.exists("./out"):
print(f"initial number of movies: {len(df.index)}") os.makedirs("./out")
if not os.path.exists("./out/categoricals.csv"):
df = df[df[["Metascore"]].notnull().all(1)] df = categorize()
else:
df["Gross"] = df["Gross"].str.replace(r"\D", "", regex=True).astype(float) * 1e6 df = pd.read_csv("./out/categoricals.csv", index_col=0)
df["Release Year"] = df["Release Year"].str.replace(r"\D", "", regex=True).str[:4]
df["Duration"] = df["Duration"].astype(int)
df["Votes"] = df["Votes"].str.replace(r"\D", "", regex=True).astype(int)
df["IMDB Rating"] = df["IMDB Rating"].astype(float)
df["Metascore"] = df["Metascore"].astype(int)
df["Director"] = pd.Categorical(df["Director"])
df["Cast"] = pd.Categorical(df["Cast"])
df["Release Year"] = pd.Categorical(df["Release Year"])
director_cat = np.array(df["Director"].cat.categories)
actor_cat = np.array(df["Cast"].cat.categories)
release_year_cat = np.array(df["Release Year"].cat.categories)
df[["Genre 1", "Genre 2", "Genre 3"]] = (
df["Genre"].str.split(", ", expand=True).astype("category")
)
genre_cat = np.unique(
np.concatenate(
[
df["Genre 1"].cat.categories,
df["Genre 2"].cat.categories,
df["Genre 3"].cat.categories,
]
)
)
df.drop(columns=["Genre"], inplace=True)
df = df[
df[["Gross", "IMDB Rating", "Metascore", "Release Year", "Votes"]].notnull().all(1)
]
print(f"number of movies after cleaning: {len(df.index)}")
target = "Gross" target = "Gross"
@@ -75,3 +40,6 @@ model = sm.OLS(y, X)
results = model.fit() results = model.fit()
print(results.summary()) print(results.summary())
with open("./out/model_summary.csv", "w") as f:
f.write(results.summary().as_csv())