How to Use a Jupyter Notebook as a Hugo Blog Post
2 minute read
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from databricks.sdk.runtime import *
from databricks.connect import DatabricksSession
# Build (or reuse) a Databricks Connect Spark session using the
# "dbrx_connect" configuration profile; `spark` is used by the cells below.
spark = DatabricksSession.builder.profile("dbrx_connect").getOrCreate()
/Users/ryan/Developer/dbrx_ml_eng_cases/.venv/lib/python3.11/site-packages/databricks/connect/session.py:433: UserWarning: Ignoring the default notebook Spark session and creating a new Spark Connect session. To use the default notebook Spark session, use DatabricksSession.builder.getOrCreate() with no additional parameters.
warnings.warn(new_notebook_session_msg)
# Location of the telco CSV data inside a Unity Catalog volume.
dataset_path = "/Volumes/dt_analytics/analytics_volumes/rhammang/telco"

# Explicit DDL schema for the CSV: everything is read as a string except
# SeniorCitizen, tenure, MonthlyCharges and TotalCharges, which are doubles.
schema_string = """
customerID string,
gender string,
SeniorCitizen double,
Partner string,
Dependents string,
tenure double,
phoneService string,
MultipleLines string,
internetService string,
OnlineSecurity string,
OnlineBackup string,
DeviceProtection string,
TechSupport string,
StreamingTV string,
StreamingMovies string,
Contract string,
PaperlessBilling string,
PaymentMethod string,
MonthlyCharges double,
TotalCharges double,
Churn string
"""

# Read the CSV with the schema above. The first row is a header, records are
# parsed in multi-line mode, and a double quote is used as the escape character.
telco_df = (
    spark.read
    .schema(schema_string)
    .csv(dataset_path, header=True, multiLine=True, escape='"')
)

# Optional exploration helpers, left disabled:
# telco_df.show(n=8, truncate=False)
# telco_df.summary().show(truncate=False)
# dbutils.data.summarize(telco_df)

# List the filesystem root through dbutils.
# NOTE(review): `d` is not used in the cells visible here — confirm whether
# this call is still needed.
d = dbutils.fs.ls('/')

# Pull the Spark DataFrame into a pandas DataFrame for local plotting.
telco_pdf = telco_df.toPandas()
# Numeric features whose pairwise correlation is plotted.
selected_columns = ['tenure', 'TotalCharges', 'MonthlyCharges']

# Correlation matrix restricted to those columns.
telco_corr = telco_pdf.loc[:, selected_columns].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 6))
sns.heatmap(
    data=telco_corr,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Heatmap for Telco Dataset')
plt.show()
# Numeric features to compare against each other.
selected_columns = ['tenure', 'TotalCharges', 'MonthlyCharges']

# Keep the numeric features plus the Churn column, which colours the points.
telco_ppdf = telco_pdf[[*selected_columns, 'Churn']]

# Pair plot: scatter plots for every feature pair, KDE curves on the diagonal.
sns.pairplot(data=telco_ppdf, hue='Churn', diag_kind='kde')
# y=1.02 lifts the title slightly above the grid of subplots.
plt.suptitle('Pairplot for Telco Dataset', y=1.02)
plt.show()
# Compare the MonthlyCharges distribution across the Churn categories.
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=telco_pdf,
    x='Churn',
    y='MonthlyCharges',
)
plt.title('Monthly Charges Distribution by Churn Status')
plt.show()
# Count customers per payment method and print the most common ones first.
(
    telco_df
    .groupBy("PaymentMethod")
    .count()
    .orderBy("count", ascending=False)
    .show(n=8, truncate=False)
)
+-------------------------+-----+
|PaymentMethod |count|
+-------------------------+-----+
|Electronic check |2365 |
|Mailed check |1612 |
|Bank transfer (automatic)|1544 |
|Credit card (automatic) |1522 |
+-------------------------+-----+