adding categorical caching
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1 +1,4 @@
|
|||||||
.venv
|
.venv
|
||||||
|
out
|
||||||
|
.DS_Store
|
||||||
|
__pycache__
|
||||||
|
|||||||
54
busi410project/categorize.py
Normal file
54
busi410project/categorize.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def categorize():
    """Load the raw IMDB movie table, clean it, and cache the result as CSV.

    Reads ``./data/imdb_top_2000_movies.csv`` (first column as index),
    converts the textual ``Gross``, ``Release Year`` and ``Votes`` columns,
    drops rows missing any of ``Gross``, ``IMDB Rating``, ``Metascore``,
    ``Release Year`` or ``Votes``, converts ``Director``, ``Cast`` and
    ``Release Year`` to categoricals, splits ``Genre`` into the categorical
    columns ``Genre 1``..``Genre 3``, and writes the cleaned table to
    ``./out/categoricals.csv``.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        ValueError: if ``Duration`` holds missing or non-integer values in a
            retained row (it is cast with ``astype(int)``).

    Note:
        CSV stores no dtype information, so a frame re-read from the cache
        file has plain object/int columns instead of the categorical dtypes
        carried by the frame returned here.
    """
    # Local import: the module header only brings in numpy and pandas.
    from pathlib import Path

    df = pd.read_csv("./data/imdb_top_2000_movies.csv", index_col=0)
    print(f"initial number of movies: {len(df.index)}")

    # Keep the decimal point when stripping currency/unit characters; the
    # previous r"\D" pattern removed it too, so a value such as "$134.97M"
    # became 13497 (million) instead of 134.97 (million).
    # NOTE(review): assumes Gross is expressed in millions, as the 1e6
    # scaling implies -- confirm against the data file.
    gross_text = df["Gross"].str.replace(r"[^\d.]", "", regex=True)
    df["Gross"] = pd.to_numeric(gross_text, errors="coerce") * 1e6

    # Keep only the first four digits of the year text.
    df["Release Year"] = (
        df["Release Year"].str.replace(r"\D", "", regex=True).str[:4]
    )

    # Unparseable or missing vote counts become NaN here and are dropped
    # below, instead of raising inside an early astype(int).
    votes_text = df["Votes"].str.replace(r"\D", "", regex=True)
    df["Votes"] = pd.to_numeric(votes_text, errors="coerce")

    # Drop incomplete rows *before* the integer casts and the categorical
    # conversion, so casts never see NaN and no category refers to a row
    # that has been removed. copy() detaches the result from the raw frame.
    required = ["Gross", "IMDB Rating", "Metascore", "Release Year", "Votes"]
    df = df.dropna(subset=required).copy()

    df["Duration"] = df["Duration"].astype(int)
    df["Votes"] = df["Votes"].astype(int)
    df["IMDB Rating"] = df["IMDB Rating"].astype(float)
    df["Metascore"] = df["Metascore"].astype(int)

    for column in ("Director", "Cast", "Release Year"):
        df[column] = pd.Categorical(df[column])

    # Split the comma-separated genre list into exactly three columns;
    # reindex pads with NaN when no retained row lists three genres.
    genres = df["Genre"].str.split(", ", expand=True).reindex(columns=range(3))
    for position, column in enumerate(("Genre 1", "Genre 2", "Genre 3")):
        df[column] = genres[position].astype("category")
    df = df.drop(columns=["Genre"])

    print(f"number of movies after cleaning: {len(df.index)}")

    # Create the output directory so the function also works when called
    # on its own, then let pandas handle encoding and newlines.
    out_dir = Path("./out")
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / "categoricals.csv")

    return df
|
||||||
@@ -1,54 +1,19 @@
|
|||||||
|
import os, csv
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import statsmodels.api as sm
|
import statsmodels.api as sm
|
||||||
import statsmodels.formula.api as smf
|
import statsmodels.formula.api as smf
|
||||||
from patsy import dmatrices
|
from patsy import dmatrices
|
||||||
|
from categorize import categorize
|
||||||
|
|
||||||
|
|
||||||
df = pd.read_csv("./data/imdb_top_2000_movies.csv", index_col=0)
|
if not os.path.exists("./out"):
|
||||||
print(f"initial number of movies: {len(df.index)}")
|
os.makedirs("./out")
|
||||||
|
|
||||||
|
if not os.path.exists("./out/categoricals.csv"):
|
||||||
df = df[df[["Metascore"]].notnull().all(1)]
|
df = categorize()
|
||||||
|
else:
|
||||||
df["Gross"] = df["Gross"].str.replace(r"\D", "", regex=True).astype(float) * 1e6
|
df = pd.read_csv("./out/categoricals.csv", index_col=0)
|
||||||
df["Release Year"] = df["Release Year"].str.replace(r"\D", "", regex=True).str[:4]
|
|
||||||
|
|
||||||
df["Duration"] = df["Duration"].astype(int)
|
|
||||||
df["Votes"] = df["Votes"].str.replace(r"\D", "", regex=True).astype(int)
|
|
||||||
|
|
||||||
df["IMDB Rating"] = df["IMDB Rating"].astype(float)
|
|
||||||
df["Metascore"] = df["Metascore"].astype(int)
|
|
||||||
|
|
||||||
|
|
||||||
df["Director"] = pd.Categorical(df["Director"])
|
|
||||||
df["Cast"] = pd.Categorical(df["Cast"])
|
|
||||||
df["Release Year"] = pd.Categorical(df["Release Year"])
|
|
||||||
|
|
||||||
director_cat = np.array(df["Director"].cat.categories)
|
|
||||||
actor_cat = np.array(df["Cast"].cat.categories)
|
|
||||||
release_year_cat = np.array(df["Release Year"].cat.categories)
|
|
||||||
|
|
||||||
df[["Genre 1", "Genre 2", "Genre 3"]] = (
|
|
||||||
df["Genre"].str.split(", ", expand=True).astype("category")
|
|
||||||
)
|
|
||||||
|
|
||||||
genre_cat = np.unique(
|
|
||||||
np.concatenate(
|
|
||||||
[
|
|
||||||
df["Genre 1"].cat.categories,
|
|
||||||
df["Genre 2"].cat.categories,
|
|
||||||
df["Genre 3"].cat.categories,
|
|
||||||
]
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
df.drop(columns=["Genre"], inplace=True)
|
|
||||||
|
|
||||||
df = df[
|
|
||||||
df[["Gross", "IMDB Rating", "Metascore", "Release Year", "Votes"]].notnull().all(1)
|
|
||||||
]
|
|
||||||
print(f"number of movies after cleaning: {len(df.index)}")
|
|
||||||
|
|
||||||
|
|
||||||
target = "Gross"
|
target = "Gross"
|
||||||
@@ -75,3 +40,6 @@ model = sm.OLS(y, X)
|
|||||||
results = model.fit()
|
results = model.fit()
|
||||||
|
|
||||||
print(results.summary())
|
print(results.summary())
|
||||||
|
|
||||||
|
with open("./out/model_summary.csv", "w") as f:
|
||||||
|
f.write(results.summary().as_csv())
|
||||||
|
|||||||
Reference in New Issue
Block a user