Leveraging Combine Data Using Machine Learning
This Python script, designed for a Google Colab environment, performs a comprehensive analysis of NFL Combine data from 2010 to 2023 to explore player performance metrics and predict draft outcomes. It begins by loading and preprocessing the dataset, handling missing values, converting height measurements, and grouping positions into broader categories. The code then conducts exploratory data analysis (EDA) with visualizations such as histograms, scatter plots, and heatmaps to uncover relationships between physical attributes (e.g., 40-yard dash, bench press) and draft results. It applies dimensionality reduction and feature selection (PCA and SelectKBest) along with clustering methods (K-Means and Hierarchical) to identify player archetypes, followed by predictive modeling using Random Forest, XGBoost, Logistic Regression, and a Deep Neural Network (DNN) to forecast draft rounds and pick numbers. Additional techniques, such as Lasso regression for feature selection and ensemble methods, are used to improve model performance. The updated dataset is saved for future use, and the script credits Rohan Madhur, Alexander Mollohan, and Jonathan David for their contributions to its development.
# Import necessary libraries for data manipulation, visualization, and machine learning
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, silhouette_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from scipy.cluster.hierarchy import dendrogram, linkage
from google.colab import files
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.cluster import DBSCAN
# --- Data Loading ---
# Upload the file in Google Colab (interactive step, uncomment when running in Colab)
# uploaded = files.upload()
# Load the NFL Combine dataset from 2010 to 2023
file_path = "nfl_combine_2010_to_2023.csv"
df = pd.read_csv(file_path)
# Display the first 5 rows of the dataset to inspect it
df.head()
# Show basic information about the dataset (data types, non-null counts)
df.info()
# Create a copy of the dataset for cleaning and modifications
df_cleaned = df.copy()
# --- Data Preprocessing ---
# Define a mapping to group NFL positions into broader categories
position_mapping = {
'QB': 'Skill Positions', 'WR': 'Skill Positions', 'RB': 'Skill Positions', 'TE': 'Skill Positions',
'S': 'Secondary', 'CB': 'Secondary', 'DB': 'Secondary',
'DE': 'Pass Rushers', 'EDGE': 'Pass Rushers', 'OLB': 'Pass Rushers',
'DT': 'D Line',
'ILB': 'Linebacker', 'LB': 'Linebacker',
'OG': 'O Line', 'OT': 'O Line', 'FB': 'O Line', 'LS': 'O Line', 'C': 'O Line', 'OL': 'O Line',
'P': 'Special Teams', 'K': 'Special Teams'
}
# Create a new column 'Position Group' by mapping the 'Pos' column using the defined dictionary
df_cleaned['Position Group'] = df_cleaned['Pos'].map(position_mapping)
# Drop the 'Drafted' column as it’s redundant (NaN in 'Pick' indicates undrafted)
df_cleaned.drop("Drafted", axis=1, inplace=True)
# Fill missing 'Round' with 8 (undrafted) and 'Pick' with 300 (beyond draft range)
df_cleaned['Round'] = df_cleaned['Round'].fillna(8)
df_cleaned['Pick'] = df_cleaned['Pick'].fillna(300)
# Check for missing values in the original dataset
df.isna().sum()
# Fill missing 'Height' values with "0-0" as a placeholder
df_cleaned['Height'] = df_cleaned['Height'].fillna("0-0")
# Define a function to convert height from "feet-inches" format to total inches
def convert_height(height_str):
if pd.isna(height_str):
return None
feet, inches = height_str.split('-')
return int(feet) * 12 + int(inches)
# Apply the height conversion function to the 'Height' column
df_cleaned["Height"] = df_cleaned["Height"].apply(convert_height)
# Check for remaining missing values after height conversion
df_cleaned.isna().sum()
# List of performance metric columns to impute with median by position
impute_cols = ['Weight', '40yd', 'Vertical', 'Bench', 'Broad Jump', '3Cone', 'Shuttle']
# Impute missing values in performance metrics with median per position
for col in impute_cols:
    if col in ['3Cone', 'Shuttle']:
        # Kickers (K) rarely run these drills; mark their missing values as -1 (did not compete)
        df_cleaned.loc[(df_cleaned['Pos'] == 'K') & (df_cleaned[col].isna()), col] = -1
    # Fill remaining missing values with the median for that position
    df_cleaned[col] = df_cleaned.groupby('Pos')[col].transform(lambda x: x.fillna(x.median()))
# Define a function to count outliers using the Interquartile Range (IQR) method
def count_outliers(series):
if series.dtype in [np.float64, np.int64]: # Apply only to numerical columns
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
return ((series < lower_bound) | (series > upper_bound)).sum()
return None
# Compute outliers and missing values per column after imputation
outliers_per_column = df_cleaned.apply(count_outliers)
na_per_column = df_cleaned.isna().sum()
# Combine results into a summary DataFrame
summary = pd.DataFrame({
'Outliers': outliers_per_column,
'Missing Values': na_per_column
}).dropna().astype(int)
# Print summary of outliers and missing values
print("Outlier and Missing Value Summary After Imputation:")
print(summary)
# Check for remaining NaNs and print first 10 rows with missing values per column
if summary['Missing Values'].sum() > 0:
print("\nFirst 10 Missing Value Rows for Each Column with NaNs:")
for col in summary[summary['Missing Values'] > 0].index:
print(f"\nColumn: {col}")
print(df_cleaned[df_cleaned[col].isna()].head(10))
# --- Feature Scaling ---
# List of numerical features to standardize
numerical_features = ['Height', 'Weight', '40yd', 'Vertical', 'Bench', 'Broad Jump', '3Cone', 'Shuttle']
scaler = StandardScaler()
df_cleaned[numerical_features] = scaler.fit_transform(df_cleaned[numerical_features])
# --- One-Hot Encoding ---
# Duplicate 'Pos' and 'Position Group' for encoding
df_cleaned['Pos1'] = df_cleaned['Pos']
df_cleaned['Position Group1'] = df_cleaned['Position Group']
# Perform one-hot encoding on duplicated columns
pos_dummies = pd.get_dummies(df_cleaned['Pos1'], prefix='Pos', drop_first=False)
position_group_dummies = pd.get_dummies(df_cleaned['Position Group1'], prefix='Position Group', drop_first=False)
# Concatenate the dummy variables with the original DataFrame
df_cleaned = pd.concat([df_cleaned, pos_dummies, position_group_dummies], axis=1)
# Drop the temporary duplicated columns
df_cleaned = df_cleaned.drop(['Pos1', 'Position Group1'], axis=1)
# Print column names to verify
print(df_cleaned.columns)
# --- Exploratory Data Analysis (EDA) ---
# Note: the numerical features were standardized above, so the EDA plots show z-scores rather than raw units
# Select only numerical columns for correlation matrix
numeric_df = df_cleaned.select_dtypes(include=[np.number])
# Compute correlation matrix
correlation_matrix = numeric_df.corr()
# Plot heatmap of correlations
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, fmt=".2f")
plt.title("Correlation Matrix Heatmap")
plt.show()
# List of metrics for distribution visualization
metrics = ['40yd', 'Bench', 'Vertical', 'Broad Jump', '3Cone', 'Shuttle']
# Plot histograms for each metric
plt.figure(figsize=(14, 10))
for i, col in enumerate(metrics, 1):
plt.subplot(2, 3, i)
sns.histplot(df_cleaned[col], bins=20, kde=True, color='blue')
plt.title(f"Distribution of {col}")
plt.xlabel(col)
plt.ylabel("Count")
plt.tight_layout()
plt.show()
# Scatter plot: Bench Press vs. Weight
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Weight', y='Bench', data=df_cleaned, alpha=0.7)
plt.title("Bench Press Reps vs. Weight")
plt.xlabel("Weight (lbs)")
plt.ylabel("Bench Press Reps")
plt.show()
# Scatter plot: 40-Yard Dash vs. Vertical Jump
plt.figure(figsize=(8, 6))
sns.scatterplot(x='40yd', y='Vertical', data=df_cleaned, alpha=0.7, color='red')
plt.title("40-Yard Dash vs. Vertical Jump")
plt.xlabel("40-Yard Dash Time (seconds)")
plt.ylabel("Vertical Jump (inches)")
plt.show()
# Bar chart: Player counts by Position Group
plt.figure(figsize=(10, 5))
sns.countplot(y='Position Group', data=df_cleaned, palette='viridis', order=df_cleaned['Position Group'].value_counts().index)
plt.title("Number of Players by Position Group")
plt.xlabel("Count")
plt.ylabel("Position Group")
plt.show()
# Boxplot: 40-Yard Dash times by Position Group
plt.figure(figsize=(12, 6))
sns.boxplot(x='Position Group', y='40yd', data=df_cleaned, palette='coolwarm')
plt.xticks(rotation=45)
plt.title("40-Yard Dash Times by Position Group")
plt.ylabel("40-Yard Dash Time (seconds)")
plt.show()
# --- Dimensionality Reduction ---
# 1. Principal Component Analysis (PCA)
# Use standardized numerical features for PCA
X_pca = df_cleaned[numerical_features].dropna() # Drop rows with NaNs
pca = PCA(n_components=0.95) # Retain 95% of variance
X_pca_transformed = pca.fit_transform(X_pca)
# Print PCA results
print("Explained Variance Ratio per Component:", pca.explained_variance_ratio_)
print("Cumulative Explained Variance Ratio:", np.cumsum(pca.explained_variance_ratio_))
print(f"Number of Components Selected: {pca.n_components_}")
# Add PCA components to the DataFrame
pca_columns = [f'PC{i+1}' for i in range(pca.n_components_)]
df_cleaned[pca_columns] = pd.DataFrame(X_pca_transformed, index=X_pca.index, columns=pca_columns)
# 2. Feature Selection with SelectKBest
# Define features including numerical and one-hot encoded columns
features = numerical_features + [col for col in df_cleaned.columns if col.startswith('Pos_') or col.startswith('Position Group_')]
X = df_cleaned[features].dropna() # Features
y = df_cleaned['Round'].dropna() # Target (draft round)
# Align X and y indices after dropping NaNs
common_index = X.index.intersection(y.index)
X = X.loc[common_index]
y = y.loc[common_index]
# Select top 10 features using ANOVA F-value
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
# Get names of selected features
selected_mask = selector.get_support()
selected_features = [features[i] for i in range(len(features)) if selected_mask[i]]
print("Top 10 Selected Features:", selected_features)
# --- Clustering Analysis ---
# Define range of clusters to test for K-Means
K_range = range(2, 10)
# Store inertia (WCSS) and silhouette scores
inertia = []
silhouette_scores = []
# Use numerical features for clustering
X_cluster = df_cleaned[numerical_features].dropna()
# Evaluate K-Means for different numbers of clusters
for k in K_range:
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X_cluster)
inertia.append(kmeans.inertia_) # Within-cluster sum of squares
silhouette_scores.append(silhouette_score(X_cluster, labels)) # Cluster cohesion/separation
# Plot Elbow Method (Inertia)
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(K_range, inertia, marker='o', linestyle='--')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("WCSS (Inertia)")
plt.title("Elbow Method for Optimal K")
# Plot Silhouette Scores
plt.subplot(1, 2, 2)
plt.plot(K_range, silhouette_scores, marker='s', linestyle='--')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score for Optimal K")
plt.tight_layout()
plt.show()
# Determine optimal K based on silhouette score
best_k = K_range[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters based on silhouette score: {best_k}")
# --- K-Means Clustering ---
# Apply K-Means with 5 clusters (can adjust based on elbow/silhouette)
kmeans = KMeans(n_clusters=5, random_state=42)
df_cleaned.loc[X_cluster.index, 'KMeans_Cluster'] = kmeans.fit_predict(X_cluster)
# Visualize K-Means clusters (40yd vs. Vertical)
plt.figure(figsize=(8, 6))
sns.scatterplot(x='40yd', y='Vertical', hue='KMeans_Cluster', data=df_cleaned, palette='deep')
plt.title("K-Means Clusters: 40-Yard Dash vs. Vertical Jump")
plt.show()
# --- Hierarchical Clustering ---
# Perform hierarchical clustering using Ward’s method
Z = linkage(X_cluster, method='ward')
# Plot dendrogram to visualize hierarchy
plt.figure(figsize=(10, 7))
dendrogram(Z, truncate_mode='level', p=5) # Show top 5 levels
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()
# Apply Agglomerative Clustering with 5 clusters
hierarchical = AgglomerativeClustering(n_clusters=5, linkage='ward')
df_cleaned.loc[X_cluster.index, 'Hierarchical_Cluster'] = hierarchical.fit_predict(X_cluster)
# --- Random Forest Classification ---
# Remove dummy variables for cleaner modeling
columns_to_remove = ['Pos_C', 'Pos_CB', 'Pos_DB', 'Pos_DE', 'Pos_DL',
'Pos_DT', 'Pos_EDGE', 'Pos_FB', 'Pos_ILB', 'Pos_K', 'Pos_LB', 'Pos_LS',
'Pos_OG', 'Pos_OL', 'Pos_OLB', 'Pos_OT', 'Pos_P', 'Pos_QB', 'Pos_RB',
'Pos_S', 'Pos_TE', 'Pos_WR', 'Position Group_D Line',
'Position Group_Linebacker', 'Position Group_O Line',
'Position Group_Pass Rushers', 'Position Group_Secondary',
'Position Group_Skill Positions', 'Position Group_Special Teams']
df_nodummy = df_cleaned.drop(columns=columns_to_remove, errors='ignore')
# Filter selected features to exclude dummy variables
selected_features_nodummy = [col for col in selected_features if col not in columns_to_remove]
# Prepare data for Random Forest
X_rf = df_nodummy[selected_features_nodummy].dropna()
y_rf = df_nodummy.loc[X_rf.index, 'Round']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)
# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Make predictions
y_pred = rf_model.predict(X_test)
# Evaluate Random Forest performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print("\nRandom Forest Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
# Feature importance from Random Forest
feature_importance = pd.DataFrame({
'Feature': selected_features_nodummy,
'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)
# --- Advanced Modeling ---
# Create interaction terms for additional features
df_cleaned['Weight_40yd'] = df_cleaned['Weight'] * df_cleaned['40yd']
df_cleaned['Bench_Vertical'] = df_cleaned['Bench'] * df_cleaned['Vertical']
# Standardize features for scaled models
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_rf)
# Split scaled data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_rf, test_size=0.2, random_state=42)
# Define hyperparameter grid for Random Forest tuning
param_dist = {
'n_estimators': [100, 200, 300, 400, 500],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
# Perform RandomizedSearchCV for hyperparameter tuning
rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
param_distributions=param_dist,
n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)
# Print best parameters from tuning
print("Best parameters found: ", rf_random.best_params_)
# Map draft rounds (1-8) to classes (0-7) for XGBoost compatibility
class_mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
y_train = y_train.map(class_mapping)
y_test = y_test.map(class_mapping)
# Train XGBoost Classifier
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
# Ensemble Learning with Voting Classifier (Random Forest + XGBoost)
ensemble_model = VotingClassifier(estimators=[('rf', rf_model), ('xgb', xgb_model)], voting='soft')
ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict(X_test)
# Print ensemble model accuracy
accuracy = accuracy_score(y_test, y_pred_ensemble)
print(f"Ensemble Model Accuracy: {accuracy:.4f}")
# Note: the original SMOTE section was incomplete (y_pred_resampled was never defined).
# If class imbalance across draft rounds is a concern, SMOTE would oversample minority rounds before retraining.
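# A minimal sketch of that SMOTE step (an assumption about the intended workflow, not part of the original run):
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
rf_resampled = RandomForestClassifier(random_state=42)
rf_resampled.fit(X_train_res, y_train_res)
print(f"Random Forest accuracy after SMOTE resampling: {accuracy_score(y_test, rf_resampled.predict(X_test)):.4f}")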
# --- Regression for Draft Pick Prediction ---
# Create a copy for regression tasks
df_reg = df_cleaned.copy()
# Select numeric features, excluding target variables
X_reg = df_reg.select_dtypes(include=['number']).drop(columns=['Pick', 'Round'])
y_reg = df_reg['Pick']
# Train-test split for regression
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
# Scale features for regression
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=500, max_depth=15, min_samples_leaf=3, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(f"Random Forest MAE: {mean_absolute_error(y_test, y_pred_rf):.2f}")
# Train XGBoost Regressor
xgb_model = XGBRegressor(n_estimators=500, max_depth=10, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print(f"XGBoost MAE: {mean_absolute_error(y_test, y_pred_xgb):.2f}")
# Ensemble predictions by averaging
y_pred_ensemble = (y_pred_rf + y_pred_xgb) / 2
print(f"Ensemble MAE: {mean_absolute_error(y_test, y_pred_ensemble):.2f}")
# --- Logistic Regression for Drafted vs. Undrafted ---
# Create a copy for logistic regression
df_logreg = df_cleaned.copy()
# Create binary 'Drafted' column (1 = drafted, 0 = undrafted)
df_logreg['Drafted'] = (df_logreg['Round'] < 8).astype(int)
# Select numeric features, excluding targets
X_class = df_logreg.select_dtypes(include=['number']).drop(columns=['Pick', 'Round', 'Drafted'])
y_class = df_logreg['Drafted']
# Train-test split for classification
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)
# Train Logistic Regression with scaling
log_reg = make_pipeline(StandardScaler(), LogisticRegression(solver='saga', max_iter=2000, random_state=42))
log_reg.fit(X_train_class, y_train_class)
# Make predictions
y_pred_class = log_reg.predict(X_test_class)
# Evaluate Logistic Regression
print("Drafted vs. Undrafted Classification")
print(f"Accuracy: {accuracy_score(y_test_class, y_pred_class):.2%}")
print(classification_report(y_test_class, y_pred_class))
# --- Lasso Regression for Feature Selection ---
# Train Lasso model for feature selection
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
# Scaling converted X_train to a NumPy array; rebuild it as a DataFrame so the coefficients keep their feature names
if not isinstance(X_train, pd.DataFrame):
    X_train = pd.DataFrame(X_train, columns=X_reg.columns)
# Extract important features from Lasso coefficients
lasso_coeffs = pd.Series(lasso.coef_, index=X_train.columns)
important_features = lasso_coeffs[lasso_coeffs != 0].sort_values(ascending=False)
print("\nKey Performance Metrics (Lasso Feature Selection)")
print(important_features)
# Visualize important features
plt.figure(figsize=(10, 5))
important_features.plot(kind="bar")
plt.title("Lasso-Selected Important Features for Draft Pick Prediction")
plt.show()
# --- Additional Clustering ---
# Create a copy for clustering
df_cluster = df_cleaned.copy()
# Use all numeric features for clustering
X_cluster = df_cluster.select_dtypes(include=['number'])
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)
# Evaluate optimal number of clusters
scores = []
inertia = []
for k in range(2, 11):
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(X_scaled)
inertia.append(kmeans.inertia_)
scores.append(silhouette_score(X_scaled, kmeans.labels_))
# Plot Elbow Method
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), inertia, marker='o')
plt.title("Elbow Method (Inertia) for Different Number of Clusters")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.show()
# Plot Silhouette Scores
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), scores, marker='o')
plt.title("Silhouette Scores for Different Number of Clusters")
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.show()
# Set number of clusters based on analysis (example: 5)
num_clusters = 5
# Apply PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Fit K-Means on PCA-transformed data
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df_cluster['Cluster_KMeans'] = kmeans.fit_predict(X_pca)
# Visualize K-Means clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df_cluster['Cluster_KMeans'], palette='viridis', alpha=0.7)
plt.title("Player Archetypes (K-Means Clustering with PCA)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.show()
# Apply Agglomerative Clustering
agglo = AgglomerativeClustering(n_clusters=num_clusters)
df_cluster['Cluster_Agglo'] = agglo.fit_predict(X_scaled)
# Visualize Agglomerative Clustering
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df_cluster['Cluster_Agglo'], palette='viridis', alpha=0.7)
plt.title("Player Archetypes (Agglomerative Clustering)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.show()
# Print sample of clustered players
print("\nClustered Player Archetypes Assigned (K-Means):")
print(df_cluster[['Player', 'Pos', 'Cluster_KMeans']].head(10))
print("\nClustered Player Archetypes Assigned (Agglomerative):")
print(df_cluster[['Player', 'Pos', 'Cluster_Agglo']].head(10))
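# To help interpret the archetypes (a small addition, not in the original output), profile each
# K-Means cluster by its mean standardized combine metrics
cluster_profiles = df_cluster.groupby('Cluster_KMeans')[numerical_features].mean().round(2)
print("\nMean standardized combine metrics per K-Means cluster:")
print(cluster_profiles)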
# --- Save Updated Data ---
# Define new file path for saving
new_file_path = file_path.replace('.csv', '_updated_data.csv')
df_cleaned.to_csv(new_file_path, index=False)
print(f"Updated file saved at: {new_file_path}")
# --- Deep Neural Network (DNN) ---
# Create a copy for DNN
df_dnn = df_cleaned.copy()
# Map draft rounds to 0-7 for classification
class_mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
df_dnn['Round_mapped'] = df_dnn['Round'].map(class_mapping)
# Drop non-numeric and target columns
X = df_dnn.drop(['Year', 'Player', 'Pos', 'School', 'Round', 'Pick', 'Round_mapped', 'Position Group'], axis=1)
y = df_dnn['Round_mapped']
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale features for DNN
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Import TensorFlow for DNN
import tensorflow as tf
from tensorflow.keras import layers, models
# Build DNN model with SELU activation and AlphaDropout
model = models.Sequential()
model.add(layers.Dense(64, activation='selu', kernel_initializer='lecun_normal', input_shape=(X_train.shape[1],)))
model.add(layers.AlphaDropout(0.1))
model.add(layers.Dense(32, activation='selu', kernel_initializer='lecun_normal'))
model.add(layers.AlphaDropout(0.1))
model.add(layers.Dense(8, activation='softmax')) # 8 classes for rounds 0-7
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=1)
# Evaluate on test set
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print("Test Accuracy:", test_acc)
# Make predictions for classification report
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
# Print classification report
print(classification_report(y_test, y_pred))
# --- PCA Variance Analysis for DNN Features ---
# Apply PCA to training data
pca = PCA()
pca.fit(X_train)
# Plot cumulative explained variance
plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA: Explained Variance by Components')
plt.grid()
plt.show()
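# A possible follow-up (an illustrative sketch, not part of the original script): retain enough
# principal components to explain ~95% of the variance and project the DNN inputs onto them.
pca_95 = PCA(n_components=0.95)
X_train_pca = pca_95.fit_transform(X_train)
X_test_pca = pca_95.transform(X_test)
print(f"Components retained for 95% variance: {pca_95.n_components_}")
print(f"Reduced training set shape: {X_train_pca.shape}")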
Text Data Vectorization and Labeling with CountVectorizer
This script efficiently processes a collection of text documents by reading files from a designated directory and categorizing them into two distinct groups: “Baseball” and “Videogames.” Utilizing the CountVectorizer from the scikit-learn library, the text is transformed into a matrix of token counts while disregarding common English stop words. The resulting matrix is then converted into a pandas DataFrame, with an additional column for document labels. The final output is saved as a CSV file, facilitating further analysis and machine learning applications.
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
corpus_path = r"C:\Users\cyase\OneDrive\Documents\Corpus"
file_names = ["Baseball1.txt", "Baseball2.txt", "Baseball3.txt", "Videogames1.txt", "Videogames2.txt", "Videogames3.txt"]
labels = ["Baseball", "Baseball", "Baseball", "Videogames", "Videogames", "Videogames"]
documents = [open(os.path.join(corpus_path, file_name), 'r', encoding='utf-8').read() for file_name in file_names]
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=[os.path.splitext(file)[0] for file in file_names])
df.insert(0, 'LABEL', labels)
output_csv_path = r"C:\Users\cyase\OneDrive\Documents\Syracuse Masters\corpus_data_labeled.csv"
df.to_csv(output_csv_path, index=True)
Text Data Cleaning, Vectorization, Clustering, and Visualization Using TF-IDF and Truncated SVD
This Python script processes a labeled text dataset through a comprehensive workflow, including cleaning, vectorizing, splitting, and clustering the text data for advanced analysis. The script begins by loading a CSV file containing labeled text, then it cleans the data by eliminating URLs, numerals, and non-alphabetical characters. It visualizes the distribution of labels and employs CountVectorizer to create a sparse matrix representation with a maximum of 5,000 features. Following a TF-IDF transformation, the data is split into training and testing sets while maintaining class balance. To efficiently store large sparse matrices, the script exports them in the Matrix Market format using SciPy’s mmwrite function. For clustering, MiniBatchKMeans is applied to group the data into five clusters, and the cluster labels are added to a DataFrame containing the training text and labels. Finally, Truncated SVD is utilized for dimensionality reduction, enabling visualization of the clusters in a scatter plot created with seaborn.
import re # regular expressions
import pandas as pd # for dataframes
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans # Use MiniBatchKMeans for large datasets
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix # Use scipy sparse matrices directly
# Use the provided paths
input_filepath = r"C:\Users\cyase\OneDrive\Documents\Syracuse Masters\Text Mining\emotions_full.csv"
output_basepath = r"C:\Users\cyase\OneDrive\Documents\Syracuse Masters\Text Mining"
# Load the dataset
print("Loading dataset...")
df = pd.read_csv(input_filepath, usecols=[1, 2])
labels = df['label']
texts = df['text']
# Data cleaning
print("Cleaning data...")
texts = texts.str.replace(r'http\S+', '', regex=True) # Remove URLs
texts = texts.str.replace(r'\b\w*\d+\w*\b', '', regex=True) # Remove numerals and words with numerals
texts = texts.str.replace(r'[^a-zA-Z\s]', '', regex=True) # Remove punctuation and non-alphabetical characters
# Exploratory Analysis: Visualize label distribution
print("Visualizing label distribution...")
plt.figure(figsize=(8, 6))
sns.countplot(x=labels)
plt.title("Label Distribution")
plt.xlabel("Label")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Vectorization
print("Vectorizing text data...")
MyCountV = CountVectorizer(input="content", lowercase=True, stop_words="english", max_features=5000) # Reduced max_features to manage memory
tokens = MyCountV.fit_transform(texts)
# Apply TF-IDF Transformation directly on sparse matrix
print("Applying TF-IDF transformation...")
tfidf_transformer = TfidfTransformer()
df_tfidf_sparse = tfidf_transformer.fit_transform(tokens)
# Create train/test split using sparse matrices
print("Splitting data into train and test sets...")
labels_array = labels.values
df_tfidf_train, df_tfidf_test, labels_train, labels_test, texts_train, texts_test = train_test_split(
df_tfidf_sparse, labels_array, texts, test_size=0.2, random_state=42, stratify=labels_array
)
# Save to CSV using sparse format only when exporting to avoid memory overflow
print("Exporting train/test sets to CSV...")
# Use CSR matrix for efficient sparse matrix format
df_tfidf_train_csr = csr_matrix(df_tfidf_train)
df_tfidf_test_csr = csr_matrix(df_tfidf_test)
# Save train and test sets using scipy's sparse matrix I/O
from scipy.io import mmwrite
mmwrite(f"{output_basepath}\\df_tfidf_train.mtx", df_tfidf_train_csr)
mmwrite(f"{output_basepath}\\df_tfidf_test.mtx", df_tfidf_test_csr)
# Cluster Analysis using MiniBatchKMeans for scalability
print("Performing cluster analysis...")
kmeans = MiniBatchKMeans(n_clusters=5, n_init='auto', random_state=42) # Use MiniBatchKMeans for large datasets
kmeans.fit(df_tfidf_train_csr) # Fit on train data only to mimic real-world scenarios
# Add cluster labels to a DataFrame for visualization
print("Preparing data for visualization...")
df_clusters = pd.DataFrame({
'text': texts_train.values,
'label': labels_train,
'cluster': kmeans.labels_
})
# Use TruncatedSVD instead of PCA for visualization (on sparse data)
print("Performing TruncatedSVD for visualization...")
svd = TruncatedSVD(n_components=2, random_state=42)
svd_result = svd.fit_transform(df_tfidf_train_csr)
df_clusters['svd1'] = svd_result[:, 0]
df_clusters['svd2'] = svd_result[:, 1]
plt.figure(figsize=(10, 8))
sns.scatterplot(x='svd1', y='svd2', hue='cluster', data=df_clusters, palette='Set1', alpha=0.6)
plt.title("Truncated SVD Visualization of Clusters")
plt.xlabel("SVD Component 1")
plt.ylabel("SVD Component 2")
plt.legend()
plt.show()
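# A quick diagnostic (added here, not in the original script): see how the unsupervised clusters
# line up with the emotion labels
print(pd.crosstab(df_clusters['cluster'], df_clusters['label']))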
print("Task completed.")
LDA-Based Topic Modeling on Text Data With Visualizations
This Python script conducts topic modeling on a set of text documents using Latent Dirichlet Allocation (LDA). It begins by loading and preprocessing the text data, lowercasing it and filtering out stop words and non-alphanumeric tokens with NLTK. After tokenizing the cleaned text, the script employs CountVectorizer to generate a term-document matrix. Using the vectorized data, LDA is applied to identify five distinct topics, showcasing the top words associated with each topic. The script provides various visualizations, including bar charts of top words, word clouds, and a heatmap depicting the distribution of topics across documents, offering insight into the predominant themes within the text.
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from wordcloud import WordCloud
import seaborn as sns
# Download NLTK resources if not already available
nltk.download('stopwords')
nltk.download('punkt')
# Define a function to remove stop words
def remove_stop_words(text):
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(text.lower())
filtered_tokens = [word for word in tokens if word not in stop_words and word.isalnum()]
return ' '.join(filtered_tokens)
# Define the path to your unzipped directory
base_path = r"C:\Users\cyase\OneDrive\Documents\Syracuse Masters\Text Mining\110"
# List of subfolders to process
subfolders = ['110-m-d', '110-m-r', '110-f-d', '110-f-r']
all_texts = []
# Process each subfolder
for subfolder in subfolders:
subfolder_path = os.path.join(base_path, subfolder)
if not os.path.exists(subfolder_path):
print(f"Path does not exist: {subfolder_path}")
continue
# Walk through each file in the subfolder
for root, dirs, files in os.walk(subfolder_path):
for file in files:
file_path = os.path.join(root, file)
print(f"Loading data from {file_path}...")
try:
with open(file_path, 'r', encoding='utf-8') as f:
text = f.read().strip()
if text: # Ensure it's not empty
cleaned_text = remove_stop_words(text)
if cleaned_text: # Ensure it's not empty after stop words removal
all_texts.append(cleaned_text)
else:
print(f"File is empty: {file_path}")
except Exception as e:
print(f"Error reading {file_path}: {e}")
# Combine the data into a DataFrame
print("Combining all texts from all subfolders...")
data = pd.DataFrame({'text': all_texts})
# Print a sample of the data
print("Data sample before vectorization:")
print(data['text'].head(10))
print(f"Number of documents: {len(data)}")
# Ensure all data is string and filter out very short documents
print("Ensuring all data in 'text' column is string and filtering out short texts...")
data['text'] = data['text'].astype(str)
data = data[data['text'].str.len() > 10]
print(f"Number of documents after filtering: {len(data)}")
# Vectorize the text data using CountVectorizer
print("Vectorizing the text data...")
vectorizer = CountVectorizer(stop_words='english', token_pattern=r'\b\w+\b')
try:
X = vectorizer.fit_transform(data['text'])
print(f"Number of features: {X.shape[1]}")
except ValueError as e:
print(f"Error: {e}")
# Check if X was successfully created
if 'X' not in locals():
print("X is not defined. Exiting.")
exit()
# Run LDA with 5 topics
n_topics = 5
print(f"Running LDA with {n_topics} topics...")
lda_model = LatentDirichletAllocation(n_components=n_topics, max_iter=100, learning_method='online')
lda_model.fit(X)
# Display the top words for each topic
print(f"Top words for each of the {n_topics} topics:")
feature_names = vectorizer.get_feature_names_out()
for index, topic in enumerate(lda_model.components_):
print(f"Topic #{index+1}:")
top_words = [feature_names[i] for i in topic.argsort()[-10:][::-1]]  # most important word first
print(" ".join(top_words))
# Visualization with Matplotlib: Bar charts for topics
print("Generating bar charts for topics...")
for index, topic in enumerate(lda_model.components_):
plt.figure(figsize=(10, 5))
top_words = [feature_names[i] for i in topic.argsort()[-10:]]
top_word_counts = [topic[i] for i in topic.argsort()[-10:]]
plt.barh(top_words, top_word_counts, color='skyblue')
plt.xlabel('Importance')
plt.title(f'Top Words for Topic #{index+1}')
plt.gca().invert_yaxis()
plt.show()
# Generate WordCloud for each topic
print("Generating word clouds for topics...")
for index, topic in enumerate(lda_model.components_):
word_freq = {feature_names[i]: topic[i] for i in topic.argsort()[-30:]}
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(f'Word Cloud for Topic #{index+1}')
plt.show()
# Generate Heatmap of Topic Distribution across Documents
print("Generating heatmap for topic distribution across documents...")
doc_topic_dist = lda_model.transform(X)
plt.figure(figsize=(10, 8))
sns.heatmap(doc_topic_dist.T, cmap='Blues', cbar=True, yticklabels=[f'Topic {i+1}' for i in range(n_topics)])
plt.xlabel('Document Index')
plt.ylabel('Topic')
plt.title('Topic Distribution Across Documents')
plt.show()
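# As a follow-up (an added sketch, not in the original script), assign each document its dominant
# topic from the distribution above and count documents per topic
dominant_topic = doc_topic_dist.argmax(axis=1) + 1
topic_counts = pd.Series(dominant_topic).value_counts().sort_index()
print("Documents per dominant topic:")
print(topic_counts)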
Text Preprocessing and Word Vectorization for Review Analysis
This Python script processes a CSV file containing text reviews and corresponding labels to prepare the data for further analysis. It begins by loading the data, removing unnamed columns, and dropping any rows with missing values. The script standardizes the labels by renaming columns. Utilizing CountVectorizer, it transforms the text reviews into a word frequency matrix, representing each word as a feature. The resulting word frequency matrix is then merged with the original labels, creating a cleaned dataset that is saved to a new CSV file for subsequent analysis.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
file_path = 'C:/Users/cyase/OneDrive/Documents/Syracuse Masters/Text Mining/T_F_Reviews.csv'
df = pd.read_csv(file_path)
print(df.head())
df.drop(columns=[col for col in df.columns if 'Unnamed' in col], inplace=True)
df.dropna(inplace=True)
df.rename(columns={'lie': 'LABEL', 'review': 'REVIEW'}, inplace=True)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['REVIEW'])
word_matrix = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df = pd.concat([df.drop(columns=['REVIEW']), word_matrix], axis=1)
print(df.head())
cleaned_file_path = 'C:/Users/cyase/OneDrive/Documents/Syracuse Masters/T_F_Reviews_Cleaned.csv'
df.to_csv(cleaned_file_path, index=False)
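# A quick usage example (not part of the original script): inspect the most frequent terms in the
# word-frequency matrix to sanity-check the vectorization
top_terms = word_matrix.sum().sort_values(ascending=False).head(15)
print(top_terms)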
Deception Detection and Sentiment Analysis Using Naive Bayes
This script implements text preprocessing and builds classification models for sentiment analysis and deception detection. It starts by reading a dataset containing labels for deception and sentiment, creating a combined text column (full_review) and removing non-alphabetic characters while converting text to lowercase. Stop words are eliminated using NLTK. The cleaned dataset is saved to an Excel file and reloaded. The data is split into two tasks: sentiment classification and lie detection. A function, train_and_evaluate_model, vectorizes the text data with CountVectorizer, trains a MultinomialNB model, and evaluates its performance using 10-fold cross-validation. The script prints the mean accuracy and standard deviation for each model, alongside the 20 most indicative words based on log probabilities. Finally, results for both models are visualized with a bar chart using Seaborn, showcasing accuracy and error variability.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
file_path = "C:/Users/cyase/OneDrive/Documents/Syracuse Masters/Text Mining/deception_data_two_labels.csv"
data = pd.read_csv(file_path)
data['full_review'] = data.apply(lambda row: ' '.join(row.drop(['lie', 'sentiment']).dropna().astype(str)), axis=1)  # combine the review text columns, excluding the label columns
data = data[['lie', 'sentiment', 'full_review']]
data['full_review'] = data['full_review'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x).lower())
stop_words = set(stopwords.words('english'))
data['full_review'] = data['full_review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
data.dropna(inplace=True)
cleaned_file_path = "C:/Users/cyase/OneDrive/Documents/Syracuse Masters/Text Mining/cleaned_deception_data.xlsx"
data.to_excel(cleaned_file_path, index=False)
cleaned_data = pd.read_excel(cleaned_file_path)
sentiment_data = cleaned_data[['full_review', 'sentiment']]
lie_detection_data = cleaned_data[['full_review', 'lie']]
X_sentiment = sentiment_data['full_review']
y_sentiment = sentiment_data['sentiment']
X_lie_detection = lie_detection_data['full_review']
y_lie_detection = lie_detection_data['lie']
def train_and_evaluate_model(X, y, label_name):
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)
model = MultinomialNB()
model.fit(X_vectorized, y)
scores = cross_val_score(model, X_vectorized, y, cv=10)
print(f"10-Fold Cross-Validation Accuracy for {label_name}: {np.mean(scores):.2f} (+/- {np.std(scores):.2f})")
feature_names = vectorizer.get_feature_names_out()
most_indicative = np.argsort(model.feature_log_prob_[1])[-20:]
indicative_words = [feature_names[i] for i in most_indicative]
print(f"20 Most Indicative Words for {label_name}: {indicative_words}")
return indicative_words
indicative_words_sentiment = train_and_evaluate_model(X_sentiment, y_sentiment, "Sentiment Classification")
indicative_words_lie_detection = train_and_evaluate_model(X_lie_detection, y_lie_detection, "Lie Detection")
models = ['Sentiment Classification', 'Lie Detection']
accuracies = [0.84, 0.62] # Mean accuracies
errors = [0.14, 0.10] # Standard deviations
data = pd.DataFrame({
'Model': models,
'Accuracy': accuracies,
'Error': errors
})
plt.figure(figsize=(10, 6))
sns.barplot(x='Model', y='Accuracy', data=data, ci=None, palette='muted')
plt.errorbar(x=data['Model'], y=data['Accuracy'], yerr=data['Error'], fmt='o', color='black', capsize=5)
plt.title('Accuracy Comparison of Sentiment Classification and Lie Detection Models')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.show()
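# The accuracies and errors plotted above are hardcoded from an earlier run. As a sketch (not part
# of the original script), the same values could be computed directly from cross-validation:
def cv_scores(X, y):
    vectorizer = CountVectorizer()
    return cross_val_score(MultinomialNB(), vectorizer.fit_transform(X), y, cv=10)
sentiment_scores = cv_scores(X_sentiment, y_sentiment)
lie_scores = cv_scores(X_lie_detection, y_lie_detection)
print(f"Recomputed accuracies: {np.mean(sentiment_scores):.2f}, {np.mean(lie_scores):.2f}")
print(f"Recomputed standard deviations: {np.std(sentiment_scores):.2f}, {np.std(lie_scores):.2f}")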
News Article Fetcher and CSV Saver
This Python script utilizes the News API to gather news articles on specified topics and saves them in CSV files. It defines three key functions: fetch_articles to retrieve articles related to a given topic, articles_to_dataframe to filter relevant columns into a structured DataFrame, and save_to_csv to store both raw and cleaned data. The script fetches articles on ‘baseball,’ ‘football,’ and ‘olympics,’ saving them in a designated directory while printing confirmation messages. This approach streamlines the process of collecting, cleaning, and storing news data on relevant topics.
import requests
import pandas as pd
import os
API_KEY = '5321a6ec84ac49a09ffe07a2919accc5'
def fetch_articles(topic):
url = f'https://newsapi.org/v2/everything?q={topic}&apiKey={API_KEY}&language=en'
response = requests.get(url)
articles = response.json().get('articles', [])
return articles
def articles_to_dataframe(articles):
df = pd.DataFrame(articles)
return df[['source', 'author', 'title', 'description', 'url', 'publishedAt', 'content']]
def save_to_csv(df, filename):
output_dir = r'C:\Users\cyase\OneDrive\Documents\Syracuse Masters\Text Mining'
os.makedirs(output_dir, exist_ok=True)
file_path = os.path.join(output_dir, filename)
df.to_csv(file_path, index=False)
topics = ['baseball', 'football', 'olympics']
for topic in topics:
articles = fetch_articles(topic)
raw_df = pd.DataFrame(articles)
cleaned_df = articles_to_dataframe(articles)
save_to_csv(raw_df, f'{topic}_articles_raw.csv')
save_to_csv(cleaned_df, f'{topic}_articles_cleaned.csv')
print("CSV files for raw and cleaned data have been created in the specified directory.")
Deception Detection using Text Classification
This Python script employs various text classification techniques to detect deception in reviews. It utilizes three machine learning algorithms: Bernoulli Naive Bayes, Multinomial Naive Bayes, and Decision Tree Classifier. The script loads a dataset of reviews labeled as either deceptive or truthful, preprocessing the text by removing punctuation, converting to lowercase, and eliminating stop words. The cleaned text is vectorized using CountVectorizer for each classifier. The script splits the data into training and testing sets, trains each model, and evaluates their performance with confusion matrices and accuracy scores. Additionally, it visualizes the decision tree, compares raw and cleaned data lengths, analyzes prediction errors, and examines feature importance, providing insights into the effectiveness of the models for deception detection.
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import string
from nltk.corpus import stopwords
import re
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data
filepath = r"C:\Users\cyase\OneDrive\Documents\Syracuse Masters\Text Mining\deception_data_two_labels.csv"
df = pd.read_csv(filepath)
# Assuming 'lie' is the label you want to predict; you can also use 'sentiment'
labels = df['lie']
# Load stopwords
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
# Remove punctuation and lowercase the text
text = text.lower()
text = text.translate(str.maketrans('', '', string.punctuation))
# Remove words that are stopwords, have numbers, have length 2 or less, or are longer than 13 characters
text = ' '.join([
word for word in text.split()
if word not in stop_words
and len(word) > 2
and len(word) <= 13
and not re.search(r'\d', word)
])
return text
# Apply preprocessing to the 'review' text data
df['clean_review'] = df['review'].astype(str).apply(preprocess_text)
# Save cleaned data to a CSV file
cleaned_filepath = r"C:\Users\cyase\OneDrive\Documents\Syracuse Masters\Text Mining\cleaned_deception_data.csv"
df.to_csv(cleaned_filepath, index=False)
# Vectorization for Bernoulli Naive Bayes
vectorizer_bernoulli = CountVectorizer(binary=True)
X_bernoulli = vectorizer_bernoulli.fit_transform(df['clean_review'])
# Vectorization for Multinomial Naive Bayes and Decision Tree
vectorizer_multinomial = CountVectorizer(binary=False)
X_multinomial = vectorizer_multinomial.fit_transform(df['clean_review'])
# Split the data into training and testing sets
X_train_bern, X_test_bern, y_train, y_test = train_test_split(X_bernoulli, labels, test_size=0.2, random_state=42)
X_train_multi, X_test_multi, _, _ = train_test_split(X_multinomial, labels, test_size=0.2, random_state=42)
# Bernoulli Naive Bayes
bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(X_train_bern, y_train)
y_pred_bern = bernoulli_nb.predict(X_test_bern)
# Confusion Matrix and Accuracy
cm_bern = confusion_matrix(y_test, y_pred_bern)
accuracy_bern = accuracy_score(y_test, y_pred_bern)
# Multinomial Naive Bayes
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train_multi, y_train)
y_pred_multi = multinomial_nb.predict(X_test_multi)
# Confusion Matrix and Accuracy
cm_multi = confusion_matrix(y_test, y_pred_multi)
accuracy_multi = accuracy_score(y_test, y_pred_multi)
# Decision Tree Classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_multi, y_train)
y_pred_tree = decision_tree.predict(X_test_multi)
# Confusion Matrix and Accuracy
cm_tree = confusion_matrix(y_test, y_pred_tree)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
# Display confusion matrices and accuracies
print("Bernoulli Naive Bayes Confusion Matrix:\n", cm_bern)
print("Bernoulli Naive Bayes Accuracy: ", accuracy_bern)
print("\nMultinomial Naive Bayes Confusion Matrix:\n", cm_multi)
print("Multinomial Naive Bayes Accuracy: ", accuracy_multi)
print("\nDecision Tree Classifier Confusion Matrix:\n", cm_tree)
print("Decision Tree Classifier Accuracy: ", accuracy_tree)
# Plot Confusion Matrices with legends
plt.figure(figsize=(18, 6))
# Bernoulli Naive Bayes
plt.subplot(1, 3, 1)
# Use the classifier's class order for the axis tick labels
sns.heatmap(cm_bern, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=bernoulli_nb.classes_, yticklabels=bernoulli_nb.classes_)
plt.title("Bernoulli Naive Bayes")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
# Multinomial Naive Bayes
plt.subplot(1, 3, 2)
sns.heatmap(cm_multi, annot=True, fmt='d', cmap='Greens', cbar=False,
            xticklabels=multinomial_nb.classes_, yticklabels=multinomial_nb.classes_)
plt.title("Multinomial Naive Bayes")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
# Decision Tree Classifier
plt.subplot(1, 3, 3)
sns.heatmap(cm_tree, annot=True, fmt='d', cmap='Oranges', cbar=False,
            xticklabels=decision_tree.classes_, yticklabels=decision_tree.classes_)
plt.title("Decision Tree Classifier")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
# Adjust layout and show the plot
plt.tight_layout()
plt.show()
# Set up the plot with a larger figure size for Decision Tree visualization
plt.figure(figsize=(40, 20))
# Limit the depth of the tree to avoid clutter (you can adjust max_depth as needed)
plot_tree(decision_tree,
feature_names=vectorizer_multinomial.get_feature_names_out(),
class_names=['Truthful', 'Deceptive'],
filled=True,
rounded=True,
fontsize=10, # Font size to keep text readable
precision=2, # Adjust precision for numeric values
max_depth=3 # Limiting the depth for better clarity
)
# Add a title to the plot
plt.title("Decision Tree Visualization for Deception Detection", fontsize=20)
# Adjust layout to avoid overlap
plt.tight_layout()
# Display the plot
plt.show()
# Visualization: Raw vs. Cleaned Data
plt.figure(figsize=(12, 6))
# Raw Data Lengths
plt.subplot(1, 2, 1)
df['raw_length'] = df['review'].apply(lambda x: len(str(x).split()))
plt.hist(df['raw_length'], bins=20, color='blue', alpha=0.7)
plt.title('Distribution of Raw Review Lengths')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
# Cleaned Data Lengths
plt.subplot(1, 2, 2)
df['clean_length'] = df['clean_review'].apply(lambda x: len(str(x).split()))
plt.hist(df['clean_length'], bins=20, color='green', alpha=0.7)
plt.title('Distribution of Cleaned Review Lengths')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
# Adjust layout and show the plot
plt.tight_layout()
plt.show()
# Feature Importance for Decision Tree
feature_importances_tree = pd.DataFrame({
'Feature': vectorizer_multinomial.get_feature_names_out(),
'Importance': decision_tree.feature_importances_
})
feature_importances_tree = feature_importances_tree.sort_values(by='Importance', ascending=False).head(20)
# Plot the top 20 most important features for Decision Tree
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances_tree)
plt.title("Top 20 Feature Importance for Decision Tree")
plt.show()
# Error Analysis for Multinomial Naive Bayes
df_test_multi = df.loc[y_test.index].copy() # Get the original test rows by index label
df_test_multi['Predicted'] = y_pred_multi # Add predicted labels
df_test_multi['Actual'] = y_test # Add actual labels
df_test_multi['Correct'] = df_test_multi['Predicted'] == df_test_multi['Actual'] # Add correctness
# Analyze errors
errors_multi = df_test_multi[df_test_multi['Correct'] == False]
print("\nMultinomial Naive Bayes Error Analysis:")
print(errors_multi[['review', 'Actual', 'Predicted']].head(10)) # Display first 10 errors
# Bar Plot of Error Counts
error_counts = errors_multi.groupby(['Actual', 'Predicted']).size().unstack(fill_value=0)
error_counts.plot(kind='bar', stacked=True, colormap='Reds', figsize=(10, 6))
plt.title("Error Counts by Actual vs. Predicted Labels")
plt.xlabel("Actual Labels")
plt.ylabel("Number of Errors")
plt.show()
# Distribution of Predicted Probabilities for Bernoulli Naive Bayes
y_prob_bern = bernoulli_nb.predict_proba(X_test_bern)[:, 1]
plt.figure(figsize=(10, 6))
sns.histplot(y_prob_bern, bins=20, kde=True)
plt.title("Distribution of Predicted Probabilities for Bernoulli Naive Bayes")
plt.xlabel("Predicted Probability of Being Deceptive")
plt.ylabel("Frequency")
plt.show()
Sentiment Analysis of Reviews Using Machine Learning Techniques
This code implements a sentiment analysis pipeline to classify text reviews as positive or negative through various machine learning models. It begins by loading positive and negative reviews from zipped files, then preprocesses the text to remove punctuation, convert to lowercase, and eliminate stop words. The cleaned text is visualized through word clouds, highlighting common terms associated with positive and negative sentiments. Feature extraction is conducted with CountVectorizer, limiting vocabulary size for efficiency. The dataset is split into training and testing sets, and multiple models, including Multinomial Naive Bayes and Support Vector Machines (SVMs), are trained and evaluated. The code optimizes SVM models using grid search for hyperparameter tuning, generating confusion matrices and classification reports for performance assessment. A bar plot compares the accuracy of the various models, illustrating the effectiveness of machine learning techniques in sentiment analysis.
import os
import pandas as pd
import zipfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
# File paths for positive and negative review zip files
pos_zip_path = r"C:\Users\cyase\OneDrive\Documents\Syracuse Masters\Text Mining\pos-20240820T011722Z-001.zip"
neg_zip_path = r"C:\Users\cyase\OneDrive\Documents\Syracuse Masters\Text Mining\neg-20240820T011834Z-001.zip"
# Function to load reviews from a zip file
def load_reviews_from_zip(zip_path, label):
reviews = []
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
for idx, filename in enumerate(zip_ref.namelist()):
with zip_ref.open(filename) as file:
reviews.append({'review': file.read().decode('utf-8'), 'sentiment': label})
return reviews
# Load positive and negative reviews from zip files
pos_reviews = load_reviews_from_zip(pos_zip_path, 'pos')
neg_reviews = load_reviews_from_zip(neg_zip_path, 'neg')
# Combine the datasets into a single DataFrame
df = pd.DataFrame(pos_reviews + neg_reviews)
# Text Preprocessing
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
text = text.lower()
text = ''.join([char for char in text if char not in string.punctuation])
text = ' '.join([word for word in text.split() if word not in stop_words])
return text
df['cleaned_review'] = df['review'].apply(preprocess_text)
# Feature Extraction using CountVectorizer
vectorizer = CountVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment'].map({'pos': 1, 'neg': 0})
# Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train Linear SVM model using LinearSVC
linear_svm_model = LinearSVC(max_iter=1000, verbose=1)
linear_svm_model.fit(X_train, y_train)
linear_svm_preds = linear_svm_model.predict(X_test)
# Evaluate the model
cm = confusion_matrix(y_test, linear_svm_preds)
print("Linear SVM Confusion Matrix:")
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Linear SVM Confusion Matrix')
plt.show()
print("Linear SVM Classification Report:")
print(classification_report(y_test, linear_svm_preds))
accuracy = accuracy_score(y_test, linear_svm_preds)
print(f"Linear SVM Accuracy: {accuracy:.4f}")
Exercise Metrics: Statistical and Predictive Analysis
This Python script applies statistical and predictive methods to analyze gym members’ exercise data, focusing on relationships between variables such as workout type, body composition, and fitness outcomes. Utilizing techniques like ANOVA, Kruskal-Wallis, Pearson correlation, and linear regression, it investigates the impact of various factors on calories burned and other metrics. The code also employs visualizations, including scatterplots, boxplots, and correlation heatmaps, to present findings in a mathematically driven and insightful manner.
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway, pearsonr, kruskal, ttest_ind
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Load the dataset
file_path = r"C:\Users\cyase\OneDrive\Documents\Personal Projects\Gym Member Data\gym_members_exercise_tracking.csv"
gym_data = pd.read_csv(file_path)
# Add a derived BMI column if not already present
gym_data['BMI'] = gym_data['Weight (kg)'] / (gym_data['Height (m)'] ** 2)
# Display first few rows
print(gym_data.head())
### Question 1: Does workout type influence calories burned?
plt.figure(figsize=(10, 6))
sns.boxplot(data=gym_data, x='Workout_Type', y='Calories_Burned', palette='viridis')
plt.title("Calories Burned by Workout Type")
plt.xlabel("Workout Type")
plt.ylabel("Calories Burned")
plt.xticks(rotation=45)
plt.show()
# ANOVA test to check for significant differences
anova_result = f_oneway(*[gym_data[gym_data['Workout_Type'] == wt]['Calories_Burned']
for wt in gym_data['Workout_Type'].unique()])
print("ANOVA Test for Calories Burned by Workout Type:", anova_result)
### Question 2: Correlation between body fat percentage and average BPM
correlation, p_value = pearsonr(gym_data['Fat_Percentage'], gym_data['Avg_BPM'])
print(f"Correlation between Body Fat Percentage and Avg BPM: {correlation:.2f} (p={p_value:.2e})")
# Scatterplot for visualization
plt.figure(figsize=(8, 6))
sns.scatterplot(data=gym_data, x='Fat_Percentage', y='Avg_BPM', hue='Gender', palette='coolwarm')
plt.title("Relationship Between Body Fat Percentage and Avg BPM")
plt.xlabel("Body Fat Percentage")
plt.ylabel("Average BPM")
plt.show()
### Question 3: How does workout frequency vary across experience levels?
plt.figure(figsize=(10, 6))
sns.boxplot(data=gym_data, x='Experience_Level', y='Workout_Frequency (days/week)', palette='Set2')
plt.title("Workout Frequency by Experience Level")
plt.xlabel("Experience Level")
plt.ylabel("Workout Frequency (days/week)")
plt.show()
# Kruskal-Wallis Test (non-parametric equivalent to ANOVA)
kruskal_result = kruskal(*[gym_data[gym_data['Experience_Level'] == el]['Workout_Frequency (days/week)']
for el in gym_data['Experience_Level'].unique()])
print("Kruskal-Wallis Test for Workout Frequency by Experience Level:", kruskal_result)
### Question 4: What factors predict calories burned?
# Prepare data for regression (use .copy() so the added columns don't trigger a SettingWithCopyWarning)
X = gym_data[['Age', 'BMI', 'Workout_Frequency (days/week)', 'Session_Duration (hours)', 'Experience_Level']].copy()
y = gym_data['Calories_Burned']
# Add interaction terms (e.g., Age*Experience_Level, BMI*Session_Duration)
X['Age_Experience'] = gym_data['Age'] * gym_data['Experience_Level']
X['BMI_Session'] = gym_data['BMI'] * gym_data['Session_Duration (hours)']
# One-hot encode categorical variables if any are present (all selected features here are numeric)
X = pd.get_dummies(X, drop_first=True)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
# Model performance
print("R-squared:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
# Coefficients
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print(coefficients.sort_values(by='Coefficient', ascending=False))
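# Optional check (illustrative sketch): standardizing the predictors puts the
# coefficients on a common scale so their relative magnitudes are comparable.
# Reuses the X_train/y_train split and column names defined above.
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
scaled_model = make_pipeline(StandardScaler(), LinearRegression())
scaled_model.fit(X_train, y_train)
std_coefs = pd.DataFrame({'Feature': X.columns,
                          'Std_Coefficient': scaled_model.named_steps['linearregression'].coef_})
print(std_coefs.sort_values(by='Std_Coefficient', ascending=False))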
### Additional Analysis
# 1. Gender differences in calories burned
male_calories = gym_data[gym_data['Gender'] == 'Male']['Calories_Burned']
female_calories = gym_data[gym_data['Gender'] == 'Female']['Calories_Burned']
t_gender_calories = ttest_ind(male_calories, female_calories)
print(f"T-test for Calories Burned by Gender: {t_gender_calories}")
# 2. Correlation between workout frequency and fat percentage
correlation_frequency_fat, p_freq_fat = pearsonr(gym_data['Workout_Frequency (days/week)'], gym_data['Fat_Percentage'])
print(f"Correlation Between Workout Frequency and Fat Percentage: {correlation_frequency_fat:.2f} (p={p_freq_fat:.2e})")
# 3. ANOVA for Max BPM across workout types
grouped_bpm = gym_data.groupby('Workout_Type')['Max_BPM']
f_bpm, p_bpm = f_oneway(*[group for name, group in grouped_bpm])
print(f"ANOVA for Max BPM by Workout Type: F={f_bpm:.2f}, p={p_bpm:.2e}")
# 4. Comparing Avg BPM across experience levels
beginner_bpm = gym_data[gym_data['Experience_Level'] == 1]['Avg_BPM']
intermediate_bpm = gym_data[gym_data['Experience_Level'] == 2]['Avg_BPM']
expert_bpm = gym_data[gym_data['Experience_Level'] == 3]['Avg_BPM']
f_experience_bpm, p_experience_bpm = f_oneway(beginner_bpm, intermediate_bpm, expert_bpm)
print(f"ANOVA for Avg BPM by Experience Level: F={f_experience_bpm:.2f}, p={p_experience_bpm:.2e}")
# 5. Heatmap: Correlation matrix with adjusted layout
plt.figure(figsize=(12, 8))
corr_matrix = gym_data[['Calories_Burned', 'Age', 'BMI', 'Workout_Frequency (days/week)',
'Session_Duration (hours)', 'Fat_Percentage']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', cbar=True)
# Rotate labels for better readability
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.title('Correlation Matrix', fontsize=16)
# Adjust layout to prevent cutting off labels
plt.tight_layout()
plt.show()
# 6. Boxplot: Max BPM by Workout Type
plt.figure(figsize=(10, 6))
sns.boxplot(x='Workout_Type', y='Max_BPM', data=gym_data, palette='viridis')
plt.title('Max BPM by Workout Type')
plt.xlabel('Workout Type')
plt.ylabel('Max BPM')
plt.show()
# 7. Scatterplot: Calories Burned vs. Session Duration by Experience Level
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Session_Duration (hours)', y='Calories_Burned', hue='Experience_Level', data=gym_data, palette='cool')
plt.title('Calories Burned vs. Session Duration by Experience Level')
plt.xlabel('Session Duration (hours)')
plt.ylabel('Calories Burned')
plt.show()
Real Estate Data Analysis and Visualization Pipeline
This code performs a comprehensive analysis of a real estate dataset, focusing on cleaning, exploring, and visualizing key insights. It starts by handling missing and inconsistent data, filtering invalid entries, and converting data types as needed. The analysis includes descriptive statistics, visualizations of housing prices, trends over time, and average prices by state. Advanced techniques, such as clustering based on price per acre and correlation analysis between house size and price, are employed to uncover patterns. Log transformations are applied for better visualization of skewed data, and various plots illustrate the distribution and relationships within the dataset. This pipeline provides actionable insights into housing trends and market dynamics.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr
from sklearn.cluster import KMeans
# Load the data
file_path = r"C:\Users\cyase\OneDrive\Documents\Personal Projects\Real Estate\realtor-data.zip.csv"
data = pd.read_csv(file_path)
# Data Cleaning: Handle missing and incorrect data types
data['price'] = pd.to_numeric(data['price'], errors='coerce')
data['acre_lot'] = pd.to_numeric(data['acre_lot'], errors='coerce')
data['house_size'] = pd.to_numeric(data['house_size'], errors='coerce')
data['prev_sold_date'] = pd.to_datetime(data['prev_sold_date'], errors='coerce')
# Remove rows where state is "Virgin Islands"
data = data[data['state'] != 'Virgin Islands']
# Drop rows with missing critical data
data = data.dropna(subset=['price', 'acre_lot', 'house_size'])
# Avoid division by zero or near-zero values for acre_lot
data = data[data['acre_lot'] > 1e-6] # Filter out rows where acre_lot is too small
# Descriptive Statistics and Initial Exploration
print("Summary Statistics:")
print(data.describe())
# Visual 1: Distribution of Housing Prices
plt.figure(figsize=(10, 6))
sns.histplot(data['price'], bins=50, kde=True, color='blue')
plt.title('Distribution of Housing Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
# Visual 2: Average Price by State
avg_price_by_state = data.groupby('state')['price'].mean().sort_values(ascending=False)
plt.figure(figsize=(14, 6)) # Increase figure size for better spacing
avg_price_by_state.plot(kind='bar', color='orange')
plt.title('Average Housing Price by State')
plt.xlabel('State')
plt.ylabel('Average Price')
# Adjust the bottom margin
plt.xticks(rotation=45, ha='right') # Rotate labels slightly for better visibility
plt.subplots_adjust(bottom=0.25) # Add more room at the bottom
plt.show()
# Advanced Analysis 1: Correlation between House Size and Price
corr, _ = pearsonr(data['house_size'], data['price'])
print(f"Correlation between House Size and Price: {corr:.2f}")
plt.figure(figsize=(8, 6))
sns.scatterplot(x='house_size', y='price', data=data, alpha=0.6, color='green')
plt.title('House Size vs. Price')
plt.xlabel('House Size (sq ft)')
plt.ylabel('Price')
plt.show()
# Visual 3: Price by Housing Status
plt.figure(figsize=(8, 6))
sns.boxplot(x='status', y='price', data=data)
plt.title('Price Distribution by Housing Status')
plt.xlabel('Housing Status')
plt.ylabel('Price')
plt.show()
# Advanced Analysis 2: Price per Acre and Clustering
# Calculate price per acre safely
data['price_per_acre'] = data['price'] / data['acre_lot']
# Remove any resulting inf or NaN values
data = data[np.isfinite(data['price_per_acre'])] # Exclude inf and NaN
# Apply KMeans clustering on price per acre
kmeans = KMeans(n_clusters=3, random_state=42)
data['cluster'] = kmeans.fit_predict(data[['price_per_acre']])
plt.figure(figsize=(10, 6))
sns.scatterplot(x='acre_lot', y='price_per_acre', hue='cluster', palette='viridis', data=data)
plt.title('Price Per Acre Clustering')
plt.xlabel('Acre Lot')
plt.ylabel('Price per Acre')
plt.legend(title='Cluster')
plt.show()
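# Sanity check on the choice of k (illustrative sketch): the silhouette score
# summarizes how well-separated the clusters are (closer to 1 is better). A random
# sample keeps the O(n^2) computation manageable on a large dataset.
from sklearn.metrics import silhouette_score
sil_sample = data[['price_per_acre']].sample(n=min(10000, len(data)), random_state=42)
sil_labels = kmeans.predict(sil_sample)
print(f"Silhouette score for k=3 (sampled): {silhouette_score(sil_sample, sil_labels):.3f}")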
# Advanced Analysis 3: Trends Over Time
data['year_sold'] = data['prev_sold_date'].dt.year
avg_price_by_year = data.groupby('year_sold')['price'].mean()
plt.figure(figsize=(10, 6))
avg_price_by_year.plot(marker='o', color='purple')
plt.title('Average Price Over Time')
plt.xlabel('Year Sold')
plt.ylabel('Average Price')
plt.grid()
plt.show()
# Advanced Analysis 4: Log-Transformed Scatter Plot
# Remove extreme outliers for better visualization (copy so new columns can be added without a SettingWithCopyWarning)
filtered_data = data[(data['house_size'] < 20000) & (data['price'] < 1e7)].copy()  # Example thresholds
# Apply log transformation
filtered_data['log_price'] = np.log1p(filtered_data['price'])
filtered_data['log_house_size'] = np.log1p(filtered_data['house_size'])
# Plot the transformed data
plt.figure(figsize=(10, 6))
sns.scatterplot(x='log_house_size', y='log_price', data=filtered_data, alpha=0.7, color='green')
plt.title('Log-Transformed House Size vs. Price')
plt.xlabel('Log House Size (sq ft)')
plt.ylabel('Log Price')
plt.show()
# Insights and Further Steps
print("\nKey Insights:")
print(f"- The correlation between house size and price is {corr:.2f}, indicating a strong positive relationship.")
print(f"- Housing prices are highest in {avg_price_by_state.idxmax()} on average.")
print("- The clustering analysis reveals distinct groups based on price per acre.")
print("- Historical trends suggest significant price increases over time in certain regions.")
# Filter extreme outliers for better visualization
filtered_data = data[data['price'] < 1e7] # Adjust threshold as needed
# Plot the distribution again after filtering
plt.figure(figsize=(10, 6))
sns.histplot(filtered_data['price'], bins=50, kde=True, color='blue')
plt.title('Distribution of Housing Prices (Filtered)')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
# Log transformation for visualization
data['log_price'] = np.log1p(data['price']) # log1p handles log(0) safely
plt.figure(figsize=(10, 6))
sns.histplot(data['log_price'], bins=50, kde=True, color='green')
plt.title('Log-Transformed Distribution of Housing Prices')
plt.xlabel('Log Price')
plt.ylabel('Frequency')
plt.show()
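# Illustrative extension (sketch): a simple regression on the log-transformed values
# quantifies the relationship plotted above; the slope is roughly the percentage
# change in price associated with a one-percent change in house size.
from sklearn.linear_model import LinearRegression
log_X = np.log1p(data[['house_size']])
log_y = np.log1p(data['price'])
log_model = LinearRegression().fit(log_X, log_y)
print(f"Estimated log-log slope: {log_model.coef_[0]:.2f}")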