Chapter 2: Business Analytics Foundations
Learn to import datasets, explore data structures, and understand basic data operations.
Practice exercise from Chapter 2: Business Analytics Foundations
Best Score: -
Not started
Start Exercise
Solution Code:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset once and keep the returned object around;
# its feature and target names are read from it in the next exercise.
california_housing = fetch_california_housing()

# Split the returned object into the feature matrix and the target vector.
X, y = california_housing.data, california_housing.target

print(X)
print(y)
Practice exercise from Chapter 2: Business Analytics Foundations
Best Score: -
Not started
Start Exercise
Solution Code:
columns = california_housing.feature_names
print(columns)
target = california_housing.target_names
print(target)
Practice exercise from Chapter 2: Business Analytics Foundations
Best Score: -
Not started
Start Exercise
Solution Code:
import pandas as pd
california_df = pd.DataFrame(data = X, columns= columns)
california_df.head(10)
california_df[&
california_df[&
california_df[&
california_df.head(10)
Practice exercise from Chapter 2: Business Analytics Foundations
Best Score: -
Not started
Start Exercise
Solution Code:
california_df.shape
Practice exercise from Chapter 2: Business Analytics Foundations
Best Score: -
Not started
Start Exercise
Solution Code:
california_df.dtypes
california_df[&
california_df[&
california_df.dtypes
Chapter 4: Data Collection and Preparation
Master data cleaning techniques including handling missing values, outliers, standardization, and normalization.
Practice exercise from Chapter 4: Data Collection and Preparation
Best Score: -
Not started
Start Exercise
Solution Code:
california_df.isnull()
california_df.isnull().sum()
california_df.isna()
california_df.isna().sum()
import numpy as np
def introduce_missing_values(df, fraction_missing, random_state=None):
    """Return a copy of ``df`` with a random share of its cells set to NaN.

    Every cell is blanked independently with probability
    ``fraction_missing``. ``df`` itself is not modified.

    Args:
        df: DataFrame whose cells should be randomly masked.
        fraction_missing: Per-cell probability of becoming missing
            (0 masks nothing, 1 masks everything).
        random_state: Optional seed. A given seed always produces the same
            pattern of missing cells; ``None`` gives a fresh random pattern.

    Returns:
        A new DataFrame with the same shape, index and columns as ``df`` in
        which the selected cells are missing.
    """
    # Use a private generator instead of np.random.seed(): seeding the global
    # generator would silently reset the random stream of every other piece
    # of code in the process. RandomState draws the same legacy sequence that
    # np.random.seed + np.random.rand produce, so seeded results are unchanged.
    rng = np.random.RandomState(random_state)
    mask = rng.rand(*df.shape) < fraction_missing
    return df.mask(mask)
df_with_missing = introduce_missing_values(california_df, 0.2, random_state=42)
print("Original DataFrame:")
print(california_df.head())
print("\nDataFrame with random missing values:")
print(df_with_missing.head())
df_with_missing.head(10)
df_with_missing.isna().sum()
import pandas as pd
import matplotlib.pyplot as plt
missing_proportion = df_with_missing.isnull().mean()
total_rows = len(df_with_missing)
missing_count = df_with_missing.isnull().sum()
missing_count_df = pd.DataFrame({&
plt.figure(figsize=(10, 6))
plt.bar(missing_proportion.index, total_rows, color=&
plt.bar(missing_proportion.index, missing_count_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.xticks(rotation=45)
plt.legend([&
plt.tight_layout()
plt.show()
df_with_missing.dropna(subset=[&
print(df_with_missing.isna().sum())
print(df_with_missing.shape)
df_with_missing.drop(&
print(df_with_missing.isna().sum())
print(df_with_missing.head(10))
lat_mode = df_with_missing[&
print(lat_mode)
df_with_missing[&
print(df_with_missing.isna().sum())
df_with_missing[&
print(df_with_missing[&
print(df_with_missing[&
print(df_with_missing[&
mean_houseage = df_with_missing[&
print(mean_houseage)
df_with_missing[&
print(df_with_missing.isna().sum())
print(df_with_missing[&
df_with_missing[&
Practice exercise from Chapter 4: Data Collection and Preparation
Best Score: -
Not started
Start Exercise
Solution Code:
california_df.boxplot()
plt.xticks(rotation=45, ha=&
plt.show()
def count_outliers(column):
    """Count the values of ``column`` that fall outside the 1.5 * IQR fences.

    A value is an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 25th and 75th percentiles of
    the column and ``IQR = Q3 - Q1``.

    Args:
        column: A pandas Series (for example one DataFrame column).

    Returns:
        The number of outlying values as an int; 0 for a non-numeric column.
    """
    # Quantiles are only meaningful for numeric data.
    if not pd.api.types.is_numeric_dtype(column):
        return 0

    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Missing values compare False on both sides, so they are never counted.
    is_outlier = (column < lower_bound) | (column > upper_bound)
    return int(is_outlier.sum())
numeric_columns = california_df.select_dtypes(include=[&
outliers_count = numeric_columns.apply(count_outliers)
print("Number of outliers in each numeric column:")
print(outliers_count)
def remove_outliers(column):
    """Return the values of ``column`` that lie within the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are dropped,
    where Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``. Values exactly on a fence are kept.

    Args:
        column: A numeric pandas Series.

    Returns:
        A new Series holding only the in-range values. The surviving values
        keep their original index labels; ``column`` itself is not modified.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    within_fences = (column >= lower_bound) & (column <= upper_bound)
    return column[within_fences]
california_df_clean = california_df.copy()
for column in california_df_clean.select_dtypes(include=[&
california_df_clean[column] = remove_outliers(california_df_clean[column])
california_df_clean = california_df_clean.dropna()
# Leading "\n" puts a blank line before the heading.
print("\nDataFrame after removing outliers:")
print(california_df_clean)
import matplotlib.pyplot as plt
california_df_clean.boxplot(figsize=(10, 6))
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
california_df_copy = california_df.copy()
california_df_copy.boxplot()
plt.xticks(rotation=45, ha=&
plt.show()
california_df_copy = california_df_copy[california_df_copy[&
california_df_copy.boxplot()
plt.xticks(rotation=45, ha=&
plt.show()
california_df_copy.shape
Practice exercise from Chapter 4: Data Collection and Preparation
Best Score: -
Not started
Start Exercise
Solution Code:
import pandas as pd
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numeric_columns = california_df.select_dtypes(include=&
california_df_standardized = scaler.fit_transform(numeric_columns)
california_df_standardized = pd.DataFrame(california_df_standardized, columns=numeric_columns.columns)
print("Standardized California DataFrame:")
print(california_df_standardized)
california_df_standardized.boxplot()
plt.xticks(rotation=45, ha=&
plt.show()
Practice exercise from Chapter 4: Data Collection and Preparation
Best Score: -
Not started
Start Exercise
Solution Code:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numeric_columns = california_df.select_dtypes(include=&
california_df_scaled = scaler.fit_transform(numeric_columns)
california_df_scaled = pd.DataFrame(california_df_scaled, columns=numeric_columns.columns)
print("Scaled California DataFrame:")
print(california_df_scaled)
california_df_scaled.boxplot()
plt.xticks(rotation=45, ha=&
plt.show()
X = california_df_scaled.drop([&
y = california_df_scaled[&
# These helpers are otherwise only imported much further down the page, so
# import them here to let this snippet run when executed in page order.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics take the true values first, then the predictions.
mse = mean_squared_error(y_test, y_pred)
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)
Chapter 7: Data Visualization
Create compelling visualizations including histograms, boxplots, scatterplots, correlation matrices, and multivariate plots.
Practice exercise from Chapter 7: Data Visualization
Best Score: -
Not started
Start Exercise
Solution Code:
california_df[&
import matplotlib.pyplot as plt
california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
california_df.hist(figsize=(15, 10), color=&
plt.suptitle(&
plt.show()
Practice exercise from Chapter 7: Data Visualization
Best Score: -
Not started
Start Exercise
Solution Code:
california_df.boxplot(&
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.boxplot(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.boxplot(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.boxplot(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.boxplot(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.boxplot(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.boxplot(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
plt.boxplot(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
import matplotlib.pyplot as plt
numeric_columns = california_df.select_dtypes(include=[&
plt.figure(figsize=(12, 8))
plt.boxplot(california_df[numeric_columns].values, patch_artist=True)
plt.xticks(range(1, len(numeric_columns) + 1), numeric_columns, rotation=45)
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.tight_layout()
plt.show()
latitude_values = california_df[&
latitude_values.head(10)
latitude_values.shape
top_50_categories = latitude_values.head(50)
plt.figure(figsize=(20, 6))
top_50_categories.plot(kind=&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.xticks(rotation=45)
plt.show()
import matplotlib.pyplot as plt
top_50_categories = california_df[&
plt.figure(figsize=(20, 6))
plt.bar(range(len(top_50_categories)), top_50_categories.values, color=&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.xticks(range(len(top_50_categories)), top_50_categories.index, rotation=45)
plt.tight_layout()
plt.show()
top_50_latitude = california_df[&
top_50_longitude = california_df[&
plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
plt.bar(range(len(top_50_latitude)), top_50_latitude.values, color=&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.xticks(range(len(top_50_latitude)), top_50_latitude.index, rotation=45)
plt.subplot(1, 2, 2)
plt.bar(range(len(top_50_longitude)), top_50_longitude.values, color=&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.xticks(range(len(top_50_longitude)), top_50_longitude.index, rotation=45)
plt.tight_layout()
plt.show()
Practice exercise from Chapter 7: Data Visualization
Best Score: -
Not started
Start Exercise
Solution Code:
california_df.plot.scatter(x=&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
plt.figure(figsize=(10, 6))
plt.scatter(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
california_df.dtypes
plt.figure(figsize=(10, 6))
plt.scatter(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
plt.figure(figsize=(10, 6))
plt.scatter(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
plt.figure(figsize=(10, 6))
plt.scatter(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
plt.figure(figsize=(10, 6))
plt.scatter(california_df[&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
Practice exercise from Chapter 7: Data Visualization
Best Score: -
Not started
Start Exercise
Solution Code:
california_df.corr(numeric_only=True)
import seaborn as sns
correlation_matrix = california_df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap=&
plt.title(&
plt.show()
Practice exercise from Chapter 7: Data Visualization
Best Score: -
Not started
Start Exercise
Solution Code:
sns.pairplot(california_df)
plt.show()
Practice exercise from Chapter 7: Data Visualization
Best Score: -
Not started
Start Exercise
Solution Code:
import matplotlib.pyplot as plt
import numpy as np
houseage_min = california_df[&
houseage_max = california_df[&
normalized_houseage = (california_df[&
top_70_categories = california_df.groupby(&
plt.figure(figsize=(20, 6))
bars = top_70_categories.plot(kind=&
cbar = plt.colorbar(plt.cm.ScalarMappable(cmap=&
cbar.set_label(&
legend_labels = [f&
cbar.set_ticks([0, 1])
cbar.set_ticklabels(legend_labels)
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.xticks(rotation=45)
plt.show()
import matplotlib.pyplot as plt
scatter = plt.scatter(x=&
plt.colorbar(scatter, label=&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
import matplotlib.pyplot as plt
scatter = plt.scatter(x=&
plt.colorbar(scatter, label=&
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
Chapter 8: Statistical Analysis
Perform statistical analysis and build regression models to predict housing prices.
Practice exercise from Chapter 8: Statistical Analysis
Best Score: -
Not started
Start Exercise
Solution Code:
california_df[&
california_df[&
california_df[&
california_df.describe(include=&
california_df.describe(include=&
california_df.describe(include=&
california_df.describe()
Practice exercise from Chapter 8: Statistical Analysis
Best Score: -
Not started
Start Exercise
Solution Code:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
X = california_df[&
y = california_df[&
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train_reshaped = X_train.values.reshape(-1, 1)
X_test_reshaped = X_test.values.reshape(-1, 1)
model = LinearRegression()
model.fit(X_train_reshaped, y_train)
model.intercept_
model.coef_
y_pred = model.predict(X_test_reshaped)
mse = mean_squared_error(y_pred, y_test)
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)
Practice exercise from Chapter 8: Statistical Analysis
Best Score: -
Not started
Start Exercise
Solution Code:
X = california_df.drop([&
y = california_df[&
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(X_train, y_train)
model.intercept_
model.coef_
y_pred = model.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)
Practice exercise from Chapter 8: Statistical Analysis
Best Score: -
Not started
Start Exercise
Solution Code:
coef_names = california_df.drop(columns=[&
# Print every feature name next to its fitted coefficient.
for name, coef in zip(coef_names, model.coef_):
    print(f"{name}: {coef}")
print("Intercept (model intercept):", model.intercept_)

# Write the fitted model out as one equation: a term per feature, then the
# intercept, all joined with " + ".
terms = [f"{coef:.4f} * {name}" for name, coef in zip(coef_names, model.coef_)]
terms.append(f"{model.intercept_:.4f}")
equation = "y = " + " + ".join(terms)
print("Regression equation:", equation)

# List the coefficients again, largest absolute value first.
sorted_coefs = sorted(zip(coef_names, model.coef_), key=lambda x: abs(x[1]), reverse=True)
for name, coef in sorted_coefs:
    print(f"{name}: {coef:.4f}")
import statsmodels.api as sm
X = sm.add_constant(california_df.drop(columns=[&
model = sm.OLS(california_df[&
p_values = model.pvalues
print("P-values for the coefficients:")
print(p_values)
Practice exercise from Chapter 8: Statistical Analysis
Best Score: -
Not started
Start Exercise
Solution Code:
residuals = y_test - y_pred
residuals.hist()
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
plt.figure(figsize=(8, 6))
plt.boxplot(residuals)
plt.title(&
plt.xlabel(&
plt.ylabel(&
plt.show()
residuals.describe()
Practice exercise from Chapter 8: Statistical Analysis
Best Score: -
Not started
Start Exercise
Solution Code:
X = california_df_standardized.drop([&
y = california_df_standardized[&
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)
X = california_df.drop([&
y = california_df[&
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)
X = california_df_scaled.drop([&
y = california_df_scaled[&
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)