1. Data Analysis and Predictive Modeling for Student Performance
This code performs a comprehensive analysis and predictive modeling on student performance data. It begins by loading and preparing the dataset, including filling in missing values and encoding categorical variables. The code explores the data with visualizations, such as age distribution and relationships between depression status, sleep, and academic performance. It then performs statistical analyses, including t-tests and chi-square tests, to identify significant factors affecting student outcomes. Finally, a logistic regression model is implemented to predict academic performance, and the model’s accuracy is evaluated with a confusion matrix and classification metrics. This approach offers a structured workflow for analyzing student behavior and academic outcomes based on various psychological and demographic factors.
# Load necessary libraries
library(tidyverse)
library(caret)
library(ggplot2)
library(e1071)
library(DescTools)
# Load the student-performance dataset.
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another machine.
file_path <- "C:/Users/cyase/OneDrive/Documents/Personal Projects/CSE_student_performances.csv"
data <- read.csv(file_path)

# Show the first few rows to understand the structure of the dataset
head(data)

# Total number of missing cells across all columns
sum(is.na(data))

# Fill missing values with the nearest observed value in the same column.
# "downup" carries the previous value forward and then back-fills any gap at
# the top of the column, so runs of consecutive NAs and an NA in the first row
# are filled too (a single lag() step leaves both of those cases as NA).
data <- data %>%
  tidyr::fill(everything(), .direction = "downup")

# Basic data summary
summary(data)
# Encode the categorical survey answers as factors whose labels are digits.
# Any value not listed in `levels` becomes NA.
data$Gender <- factor(
  data$Gender,
  levels = c("Male", "Female"),
  labels = c(1, 0)
)
data$DepressionStatus <- factor(
  data$DepressionStatus,
  levels = c("Yes", "No", "Sometimes"),
  labels = c(1, 0, 2)
)

# The remaining questions are plain Yes/No and share one encoding
# (Yes -> 1, No -> 0).
yes_no_columns <- c(
  "TakingNoteInClass",
  "FaceChallangesToCompleteAcademicTask",
  "LikePresentation",
  "LikeNewThings"
)
data[yes_no_columns] <- lapply(
  data[yes_no_columns],
  factor,
  levels = c("Yes", "No"),
  labels = c(1, 0)
)

# Indicator columns for 'AcademicPerformance' (Excellent, Good, Average):
# a row that is neither "Good" nor "Excellent" gets 0 in both columns.
data <- data %>%
  mutate(
    AcademicPerformance_Good = as.numeric(AcademicPerformance == "Good"),
    AcademicPerformance_Excellent = as.numeric(AcademicPerformance == "Excellent")
  )
# Visualize the distribution of Age.
# The histogram's y-axis is a count, whereas geom_density() defaults to a
# density (area = 1). Mapping the curve to after_stat(count) puts it on the
# same scale as the 1-year-wide bars, so it no longer collapses onto the
# x-axis.
ggplot(data, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  geom_density(aes(y = after_stat(count)), alpha = 0.3, fill = "blue") +
  ggtitle("Distribution of Age")
# Bar chart: number of rows per Gender level
ggplot(data) +
  geom_bar(aes(x = Gender), fill = "skyblue", color = "black") +
  labs(title = "Gender Distribution")

# Side-by-side bars: DepressionStatus split by the AcademicPerformance_Good
# indicator
data %>%
  ggplot(
    aes(
      x = factor(DepressionStatus),
      fill = factor(AcademicPerformance_Good)
    )
  ) +
  geom_bar(position = "dodge") +
  labs(title = "Depression Status vs Academic Performance")

# Boxplot: SleepPerDayHours for each value of the AcademicPerformance_Good
# indicator
data %>%
  ggplot(aes(x = factor(AcademicPerformance_Good), y = SleepPerDayHours)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Sleep Hours vs Academic Performance")
# Correlation heatmap for the two numeric survey columns
cor_data <- data %>%
  select(Age, SleepPerDayHours)

# Only rows with both values present contribute to the correlation
cor_matrix <- cor(cor_data, use = "complete.obs")

# as.table() + as.data.frame() reshapes the matrix into long form with
# columns Var1, Var2 and Freq (the correlation), one row per tile
as.data.frame(as.table(cor_matrix)) %>%
  ggplot(aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(mid = "white", high = "blue") +
  labs(title = "Correlation Matrix")
# Statistical Analysis:
# 1. t-tests comparing mean Age and SleepPerDayHours between students with
#    depression (label 1 = "Yes") and without (label 0 = "No"). Rows labelled
#    2 ("Sometimes") belong to neither group.
# %in% never returns NA, so rows whose DepressionStatus is NA are excluded;
# subsetting with `==` would add an all-NA phantom row to both groups for
# every such record.
depressed <- data[data$DepressionStatus %in% "1", ]
non_depressed <- data[data$DepressionStatus %in% "0", ]

# T-test for Age (t.test() defaults to the Welch unequal-variance test)
t_test_age <- t.test(depressed$Age, non_depressed$Age)
print(t_test_age)

# T-test for SleepPerDayHours
t_test_sleep <- t.test(depressed$SleepPerDayHours, non_depressed$SleepPerDayHours)
print(t_test_sleep)

# 2. Chi-square test of independence between Gender and the
#    AcademicPerformance_Good indicator
contingency_table <- table(data$Gender, data$AcademicPerformance_Good)
chi_square_result <- chisq.test(contingency_table)
print(chi_square_result)
# Predictive Modeling: logistic regression for 'AcademicPerformance_Good'
# (1 if Good, 0 if not), using the other features as predictors.

# Drop the raw label AND both indicator columns derived from it. Leaving
# AcademicPerformance_Good among the predictors lets the model read the answer
# directly (target leakage), and AcademicPerformance_Excellent is computed
# from the same label.
model_data <- data %>%
  select(
    -AcademicPerformance,
    -AcademicPerformance_Good,
    -AcademicPerformance_Excellent
  )
X <- model_data
y <- data$AcademicPerformance_Good

# Train-test split (70/30). Passing a factor makes createDataPartition()
# sample within each class, so both sets keep the class balance.
set.seed(42)
train_index <- createDataPartition(factor(y), p = 0.7, list = FALSE)
X_train <- X[train_index, ]
y_train <- y[train_index]
X_test <- X[-train_index, ]
y_test <- y[-train_index]

# Logistic Regression Model (y_train is resolved from the calling environment
# because it is not a column of X_train)
log_model <- glm(y_train ~ ., data = X_train, family = binomial)

# Predict on test data: probabilities first, then a 0.5 cut-off
y_pred <- predict(log_model, X_test, type = "response")
y_pred_class <- ifelse(y_pred > 0.5, 1, 0)

# Evaluate the model; rows that could not be scored (NA prediction) are
# ignored instead of turning the whole accuracy into NA
accuracy <- mean(y_pred_class == y_test, na.rm = TRUE)
print(paste("Accuracy:", accuracy))

# Classification report (Confusion matrix, precision, recall, F1-score).
# Explicit levels keep the two factors aligned even when only one class is
# predicted; positive = "1" reports the metrics for the "Good" class, and
# mode = "everything" adds precision/recall/F1 to the default output.
conf_matrix <- confusionMatrix(
  factor(y_pred_class, levels = c(0, 1)),
  factor(y_test, levels = c(0, 1)),
  positive = "1",
  mode = "everything"
)
print(conf_matrix)

# Coefficients of the model
print(coef(log_model))
2. Analyzing User Behavior Patterns Through Device and App Usage Data
This code explores user behavior patterns in relation to device characteristics and app usage. It starts by preprocessing the dataset, including factor conversions, and then investigates the impact of device model, operating system, and app usage patterns on user behavior class through descriptive statistics, visualizations, and linear regression models. It analyzes correlations between screen-on time, data usage, and other behavioral metrics. Interaction effects and gender-based patterns are also explored to provide a nuanced understanding of behavior. Additionally, polynomial regression and K-means clustering are employed to capture complex relationships and categorize users into behavioral clusters based on app and device usage, offering insights into diverse user profiles.
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Read the user-behavior dataset and turn the four categorical columns into
# factors in one step.
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another machine.
data <- read.csv("C:/Users/cyase/OneDrive/Documents/Personal Projects/user_behavior_dataset.csv") %>%
  mutate(
    Device.Model = as.factor(Device.Model),
    Operating.System = as.factor(Operating.System),
    Gender = as.factor(Gender),
    User.Behavior.Class = as.factor(User.Behavior.Class)
  )
# Question 1: Impact of Device Characteristics on User Behavior Class

# Summary statistics per device model / operating system.
# as.numeric() on a factor returns the level position (1, 2, ...), which is
# what gets averaged here.
# .groups = "drop" returns an ungrouped tibble; without it the result stays
# grouped by Device.Model and dplyr prints a regrouping message.
summary_stats <- data %>%
  group_by(Device.Model, Operating.System) %>%
  summarise(
    Avg_Apps_Installed = mean(Number.of.Apps.Installed, na.rm = TRUE),
    Avg_Behavior_Class = mean(as.numeric(User.Behavior.Class), na.rm = TRUE),
    .groups = "drop"
  )
print(summary_stats)

# Visualize the relationship between Number of Apps Installed and User
# Behavior Class, with one linear fit per operating system
ggplot(data, aes(x = Number.of.Apps.Installed, y = as.numeric(User.Behavior.Class), color = Operating.System)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Relationship Between Number of Apps Installed and User Behavior Class",
    x = "Number of Apps Installed",
    y = "User Behavior Class"
  )

# Perform a linear regression analysis to quantify the relationship
model <- lm(as.numeric(User.Behavior.Class) ~ Number.of.Apps.Installed + Device.Model + Operating.System, data = data)
summary(model)
# Question 2: Screen On Time and Data Usage Patterns Across User Behavior Classes

# Scatter plot of screen-on time against data usage, with points and linear
# fits coloured by User Behavior Class
data %>%
  ggplot(
    aes(
      x = Screen.On.Time..hours.day.,
      y = Data.Usage..MB.day.,
      color = User.Behavior.Class
    )
  ) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Screen On Time vs. Data Usage Across User Behavior Classes") +
  xlab("Screen On Time (hours/day)") +
  ylab("Data Usage (MB/day)")

# Correlation between the two metrics inside each behavior class, computed
# from the rows where both values are present
correlation_by_class <- summarise(
  group_by(data, User.Behavior.Class),
  Correlation = cor(
    Screen.On.Time..hours.day.,
    Data.Usage..MB.day.,
    use = "complete.obs"
  )
)
print(correlation_by_class)
# Linear model with the full three-way interaction between apps installed,
# operating system and device model (`*` expands to all main effects and
# interactions)
interaction_model <- lm(
  as.numeric(User.Behavior.Class) ~
    Number.of.Apps.Installed * Operating.System * Device.Model,
  data = data
)
summary(interaction_model)

# One panel per device model; inside each panel, points and linear fits are
# coloured by operating system
data %>%
  ggplot(
    aes(
      x = Number.of.Apps.Installed,
      y = as.numeric(User.Behavior.Class),
      color = Operating.System
    )
  ) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ Device.Model) +
  ggtitle("Interaction of Number of Apps Installed and Operating System on User Behavior Class") +
  xlab("Number of Apps Installed") +
  ylab("User Behavior Class")

# Same relationship, this time coloured by Gender
data %>%
  ggplot(
    aes(
      x = Number.of.Apps.Installed,
      y = as.numeric(User.Behavior.Class),
      color = Gender
    )
  ) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Number of Apps Installed vs. User Behavior Class by Gender") +
  xlab("Number of Apps Installed") +
  ylab("User Behavior Class")

# Additive linear model with Gender added as a predictor
gender_model <- lm(
  as.numeric(User.Behavior.Class) ~
    Number.of.Apps.Installed + Device.Model + Operating.System + Gender,
  data = data
)
summary(gender_model)
# App usage time against behavior class, with one linear fit per operating
# system
data %>%
  ggplot(
    aes(
      x = App.Usage.Time..min.day.,
      y = as.numeric(User.Behavior.Class),
      color = Operating.System
    )
  ) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("App Usage Time vs. User Behavior Class") +
  xlab("App Usage Time (min/day)") +
  ylab("User Behavior Class")

# Correlation between app usage time and the behavior class (as its factor
# level position), using rows where both are present
correlation_app_usage <- with(
  data,
  cor(
    App.Usage.Time..min.day.,
    as.numeric(User.Behavior.Class),
    use = "complete.obs"
  )
)
print(correlation_app_usage)

# Linear regression with App Usage Time replacing the number of apps
app_usage_model <- lm(
  as.numeric(User.Behavior.Class) ~
    App.Usage.Time..min.day. + Device.Model + Operating.System,
  data = data
)
summary(app_usage_model)
# Quadratic (degree-2 orthogonal polynomial) term for Number of Apps
# Installed, alongside device model and operating system
poly_model <- lm(
  as.numeric(User.Behavior.Class) ~
    poly(Number.of.Apps.Installed, 2) + Device.Model + Operating.System,
  data = data
)
summary(poly_model)

# Plot the data with a quadratic smoother per operating system
data %>%
  ggplot(
    aes(
      x = Number.of.Apps.Installed,
      y = as.numeric(User.Behavior.Class),
      color = Operating.System
    )
  ) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  ggtitle("Polynomial Relationship Between Number of Apps Installed and User Behavior Class") +
  xlab("Number of Apps Installed") +
  ylab("User Behavior Class")
# Prepare data for clustering by selecting relevant numeric columns
clustering_data <- data %>%
  select(
    Number.of.Apps.Installed,
    Screen.On.Time..hours.day.,
    Data.Usage..MB.day.,
    App.Usage.Time..min.day.
  )

# kmeans() stops with an error on NA input, while the rest of this script
# tolerates missing values. Cluster only the rows where all four metrics are
# present; the other rows get an NA cluster below.
complete_rows <- complete.cases(clustering_data)

# Standardize the data so every metric contributes on the same scale
clustering_data_scaled <- scale(clustering_data[complete_rows, , drop = FALSE])

# Perform K-means clustering with 3 clusters (you can experiment with
# different numbers); nstart = 20 keeps the best of 20 random starts
set.seed(123)
kmeans_result <- kmeans(clustering_data_scaled, centers = 3, nstart = 20)

# Add cluster results to the original dataset, aligned to the rows that were
# actually clustered
cluster_labels <- rep(NA_integer_, nrow(data))
cluster_labels[complete_rows] <- kmeans_result$cluster
data$Cluster <- as.factor(cluster_labels)

# Visualize clusters with Number of Apps Installed and Data Usage
ggplot(data, aes(x = Number.of.Apps.Installed, y = Data.Usage..MB.day., color = Cluster)) +
  geom_point() +
  labs(
    title = "K-means Clustering Based on App and Device Usage",
    x = "Number of Apps Installed",
    y = "Data Usage (MB/day)"
  )
3. Comparative Modeling of Loan Acceptance Using Logistic, Probit, and Neural Network Models
This code investigates the likelihood of personal loan acceptance by using logistic and probit regression models, both with and without interaction terms, and a neural network model for enhanced prediction accuracy. The initial sections fit and summarize both the logit and probit models, examining how variables like income, family size, credit card usage, and account types impact loan acceptance probability. Interaction terms are added to capture potential combined effects, and predictions are plotted to visualize sensitivity to income changes. A neural network model with five hidden units further explores these relationships, creating predictions for loan acceptance based on income, family size, and account information. Finally, sensitivity analysis plots illustrate model-predicted probabilities against income, allowing a detailed comparison between models.
library(ggplot2)
library(nnet)
library(dplyr)
# The models below expect `data` to be the personal-loan data frame. Nothing
# in this section loads it, so fail fast with a clear message if `data` is
# still something else (for example a dataset left over from an earlier
# analysis) instead of failing inside glm() with "object not found".
loan_columns <- c(
  "PersonalLoan", "Income", "Family", "CCAvg", "Education",
  "SecuritiesAccount", "CDAccount", "Online", "CreditCard"
)
if (!is.data.frame(data) || !all(loan_columns %in% names(data))) {
  stop(
    "`data` must be the personal-loan data frame with columns: ",
    paste(loan_columns, collapse = ", "),
    call. = FALSE
  )
}

# Logit model: Logistic Regression of PersonalLoan on the eight predictors
model_logit <- glm(
  PersonalLoan ~ Income + Family + CCAvg + Education +
    SecuritiesAccount + CDAccount + Online + CreditCard,
  data = data, family = binomial(link = "logit")
)

# Display summary of the logit model
summary(model_logit)

# Probit model: same formula, probit link instead of logit
model_probit <- glm(
  PersonalLoan ~ Income + Family + CCAvg + Education +
    SecuritiesAccount + CDAccount + Online + CreditCard,
  data = data, family = binomial(link = "probit")
)

# Display summary of the probit model
summary(model_probit)
# ---- Part 2: logit and probit models with interaction terms ----
# Logit model with interaction terms
# Extends the eight main effects with two interactions: Income:CCAvg and
# Education:Family.
# NOTE(review): `data` is expected to be the personal-loan data frame; it is
# not loaded in this part of the script -- confirm it is in scope.
model_logit_interaction <- glm(PersonalLoan ~ Income + Family + CCAvg + Education +
SecuritiesAccount + CDAccount + Online + CreditCard +
Income:CCAvg + Education:Family,
data = data, family = binomial(link = "logit"))
# Display summary of the logit model with interactions
summary(model_logit_interaction)
# Probit model with interaction terms
# Same formula as the logit model above; only the link function differs.
model_probit_interaction <- glm(PersonalLoan ~ Income + Family + CCAvg + Education +
SecuritiesAccount + CDAccount + Online + CreditCard +
Income:CCAvg + Education:Family,
data = data, family = binomial(link = "probit"))
# Display summary of the probit model with interactions
summary(model_probit_interaction)
# ---- Part 3: predicted probabilities across income ----
# Example prediction: interaction models evaluated on a 100-point grid.
# Income and CCAvg both run from their observed minimum to maximum; Family is
# held at its mean and the remaining predictors at their first level.
# na.rm = TRUE on min()/max() matches the mean() call: a single missing value
# would otherwise make seq() fail.
# NOTE(review): CCAvg changes together with Income along the grid, so the
# curves show their joint effect -- confirm this is intended for an "income"
# sensitivity plot.
# NOTE(review): the factor() columns assume the models were fitted with these
# predictors as factors -- confirm against how `data` was prepared.
new_data <- data.frame(
  Income = seq(
    min(data$Income, na.rm = TRUE),
    max(data$Income, na.rm = TRUE),
    length.out = 100
  ),
  Family = mean(data$Family, na.rm = TRUE),
  CCAvg = seq(
    min(data$CCAvg, na.rm = TRUE),
    max(data$CCAvg, na.rm = TRUE),
    length.out = 100
  ),
  Education = factor(1, levels = c(1, 2, 3)),
  SecuritiesAccount = factor(0, levels = c(0, 1)),
  CDAccount = factor(0, levels = c(0, 1)),
  Online = factor(0, levels = c(0, 1)),
  CreditCard = factor(0, levels = c(0, 1))
)

# Explicit interaction columns, kept for inspection only: predict() rebuilds
# Income:CCAvg and Education:Family from the model formula and does not read
# these two columns.
new_data$Income_CCAvg <- new_data$Income * new_data$CCAvg
new_data$Education_Family <- as.numeric(new_data$Education) * new_data$Family

# Predict probabilities (type = "response" returns values on the 0-1 scale)
new_data$logit_pred_interaction <- predict(model_logit_interaction, newdata = new_data, type = "response")
new_data$probit_pred_interaction <- predict(model_probit_interaction, newdata = new_data, type = "response")

# Sensitivity Analysis Plot: one line per model; the colour legend is driven
# by the constant strings mapped inside aes()
ggplot(new_data, aes(x = Income)) +
  geom_line(aes(y = logit_pred_interaction, color = "Logit (with Interaction)")) +
  geom_line(aes(y = probit_pred_interaction, color = "Probit (with Interaction)")) +
  labs(
    title = "Predicted Probability of Taking a Loan by Income (with Interactions)",
    x = "Income ($000)",
    y = "Predicted Probability"
  ) +
  theme_minimal() +
  scale_color_manual(
    name = "Model",
    values = c("Logit (with Interaction)" = "blue", "Probit (with Interaction)" = "red")
  )
# ---- Part 4: neural network model ----
# Load necessary library for neural network
library(nnet)

# Prepare data for neural network: PersonalLoan as numeric 0/1.
# A factor's integer codes start at 1, so subtract 1 for a two-level factor;
# a column that is already numeric is passed through unchanged (subtracting 1
# from it would produce -1/0).
data_nn <- data %>%
  mutate(
    PersonalLoan = if (is.factor(PersonalLoan)) {
      as.numeric(PersonalLoan) - 1
    } else {
      as.numeric(PersonalLoan)
    }
  )

# nnet() starts from random initial weights; fix the seed so repeated runs
# give the same fit
set.seed(123)

# Fit the neural network model: one hidden layer of 5 units, weight decay 0.1,
# at most 200 iterations
nn_model <- nnet(
  PersonalLoan ~ Income + Family + CCAvg + Education +
    SecuritiesAccount + CDAccount + Online + CreditCard,
  data = data_nn, size = 5, decay = 0.1, maxit = 200
)

# Display summary of the neural network model
print(nn_model)

# Make predictions using the neural network model
nn_predictions <- predict(nn_model, newdata = data_nn, type = "raw")

# predict() returns a one-column matrix; store it as a plain numeric vector so
# the data frame column behaves like any other
data_nn$nn_predictions <- as.numeric(nn_predictions)

# Sensitivity Analysis: neural network predictions against Income.
# Each row is one observation, and many rows share an Income value, so points
# are used; a line through the raw rows would zig-zag between them.
ggplot(data_nn, aes(x = Income, y = nn_predictions)) +
  geom_point(color = "blue", alpha = 0.3) +
  labs(
    title = "Neural Network Predictions by Income",
    x = "Income ($000)",
    y = "Predicted Probability"
  ) +
  theme_minimal()
4. Comparative Analysis of Plant Growth Treatments Using Traditional and Bayesian Methods
This code explores the effects of different treatments on plant growth by comparing the weights of plants in control and treatment groups within the PlantGrowth dataset. First, a traditional t-test is conducted between the control (ctrl) and two treatment groups (trt1 and trt2), providing a classical statistical comparison. For a more robust analysis, Bayesian Estimation Supersedes the t-test (BEST) with Markov Chain Monte Carlo (MCMC) is used to estimate the posterior distributions of the difference in means between these groups. The Bayesian approach provides credible intervals and visualizations, allowing a deeper interpretation of treatment effects on plant weight by plotting the posterior distributions for both comparisons (ctrl vs. trt1 and ctrl vs. trt2).
# Make the built-in PlantGrowth dataset available
data("PlantGrowth")

# Welch two-sample t-test of plant weight, control vs. treatment 1.
# Only the ctrl and trt1 rows are passed to the test.
t_test_result <- t.test(
  weight ~ group,
  data = PlantGrowth[PlantGrowth$group %in% c("ctrl", "trt1"), ]
)

# Show the test statistic, degrees of freedom, p-value and group means
print(t_test_result)
# ---- Bayesian comparison (BEST): ctrl vs. trt1 ----
# Install the Bayesian-estimation dependencies only when they are missing, so
# re-running the script does not reinstall every package each time.
# NOTE(review): rjags also needs the JAGS program installed on the system --
# confirm it is available before running.
if (!requireNamespace("rjags", quietly = TRUE)) {
  install.packages("rjags")
}
library(rjags)

# Note you may have to install HDInterval first if you have a Mac.
if (!requireNamespace("HDInterval", quietly = TRUE)) {
  install.packages("HDInterval")
}
library(HDInterval)

# BEST is installed from the CRAN archive as a source package
if (!requireNamespace("BEST", quietly = TRUE)) {
  install.packages(
    "https://cran.r-project.org/src/contrib/Archive/BEST/BEST_0.5.4.tar.gz",
    repos = NULL,
    type = "source"
  )
}
library(BEST)

# Load the PlantGrowth dataset
data("PlantGrowth")

# Subset the data for the ctrl and trt1 groups
ctrl_data <- PlantGrowth$weight[PlantGrowth$group == "ctrl"]
trt1_data <- PlantGrowth$weight[PlantGrowth$group == "trt1"]

# Run BEST MCMC analysis (first sample = ctrl, second sample = trt1)
best_result <- BESTmcmc(ctrl_data, trt1_data)

# Plot the results
plot(best_result)
# ---- Bayesian comparison (BEST) and t-test: ctrl vs. trt2 ----
# Keep only the control and treatment-2 rows
data_trt2 <- PlantGrowth[PlantGrowth$group %in% c("ctrl", "trt2"), ]

# Bayesian comparison of ctrl (y1) against trt2 (y2)
result_trt2_ctrl <- BESTmcmc(
  y1 = with(data_trt2, weight[group == "ctrl"]),
  y2 = with(data_trt2, weight[group == "trt2"])
)

# Plot the BESTmcmc result for this comparison
plot(result_trt2_ctrl)

# Classical t-test on the same two groups, for comparison
t_test_result <- t.test(weight ~ group, data = data_trt2)
print(t_test_result)