import streamlit as st #import for webapp
import pandas as pd #pandas for data managing in python
import numpy as np #we need numpy for tensor manipulation
import matplotlib.pyplot as plt # we will plot some figures
import matplotlib.cm as cm # with nice colors
import plotly.express as px # and also 3-d charts
import hana_ml.dataframe as dataframe #main dataframe with SAP HANA
from hana_ml.algorithms.pal.clustering import KMeans # our choice of clustering algo
# Title info: sets the browser-tab title and icon, the page layout and the
# initial sidebar state.
# `beta_set_page_config` is the pre-release name of `set_page_config`; newer
# Streamlit releases only ship the latter, so use whichever one is available.
_set_page_config = getattr(st, "set_page_config", None) or st.beta_set_page_config
_set_page_config(
    page_title="Cluster App",
    # Stray trailing space removed from the icon URL.
    page_icon="https://cdn.appythings.nl/wp-content/uploads/2018/06/SAP-logo-icon-PNG-Transparent-Background.png",
    layout="centered",
    initial_sidebar_state="expanded",
)
# Reading data
import hana_ml.dataframe as dataframe

# Connection parameters for the SAP HANA instance. They are blank
# placeholders here and have to be filled in before the app can connect.
_hana_settings = dict(
    address='',
    port=0,
    user='',
    password='',
    encrypt='true',
    sslValidateCertificate='false',
)
conn = dataframe.ConnectionContext(**_hana_settings)

# hana_ml dataframe over MALL_CUSTOMERS, sorted ascending by CUSTOMER_ID.
df = conn.table(table='MALL_CUSTOMERS')
df = df.sort('CUSTOMER_ID', desc=False)
# Column selector for clustering
cluster_cols = st.multiselect(
    "Specify columns for clustering",
    ['AGE', 'ANNUAL_INCOME', 'SPENDING_SCORE'],
    default=['ANNUAL_INCOME', 'SPENDING_SCORE'],
)
# Clustering needs at least one feature next to the key column; halt this
# script run until the user has picked one.
if not cluster_cols:
    st.warning('Select at least one column for clustering.')
    st.stop()
# Chosen feature columns plus the key column that fit_predict is given below.
X = df[cluster_cols + ['CUSTOMER_ID']]
# Cluster count (slider: min 2, max 10, initial value 2)
n_clusters = st.slider('How many clusters?', 2, 10, 2)
# Cluster
pal_kmeans = KMeans(n_clusters=n_clusters)
# Fit on X rather than df so the column selection above determines the
# features that are clustered.
labels = pal_kmeans.fit_predict(data=X, key='CUSTOMER_ID')
# Visualization
selected_x = st.selectbox('Select x column:', ('AGE', 'ANNUAL_INCOME', 'SPENDING_SCORE'))
selected_y = st.selectbox('Select y column:', ('AGE', 'ANNUAL_INCOME', 'SPENDING_SCORE'))
st.write(selected_x)
# Fetch the plotted columns once (de-duplicated, because the same column may
# be chosen for both axes) and join the cluster labels on the key, so that
# every point gets its own cluster id regardless of result ordering.
scatter_cols = list(dict.fromkeys(['CUSTOMER_ID', selected_x, selected_y]))
scatter_points = df[scatter_cols].collect().merge(
    labels[['CUSTOMER_ID', 'CLUSTER_ID']].collect(), on='CUSTOMER_ID'
)
# The whitegrid style is named differently across matplotlib versions; use
# whichever name is installed, otherwise fall back to the default style.
scatter_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
# plt.style.context only has an effect when used as a context manager.
with plt.style.context(scatter_style):
    scatter_fig, scatter_ax = plt.subplots()
    scatter_ax.scatter(scatter_points[selected_x],
                       scatter_points[selected_y],
                       c=scatter_points['CLUSTER_ID'])
    scatter_ax.set_xlabel(selected_x)
    scatter_ax.set_ylabel(selected_y)
    # Pass the figure explicitly instead of relying on pyplot's global figure.
    st.pyplot(scatter_fig)
# Close the figure so later pyplot calls start from a fresh one.
plt.close(scatter_fig)
# 3d visualization
# Join the cluster labels to the customer rows on the key instead of pairing
# two independently collected results by position.
points_3d = df.collect().merge(
    labels[['CUSTOMER_ID', 'CLUSTER_ID']].collect(), on='CUSTOMER_ID'
)
fig = px.scatter_3d(points_3d, x='ANNUAL_INCOME', y='SPENDING_SCORE', z='AGE',
                    color='CLUSTER_ID')
st.plotly_chart(fig, use_container_width=True)
# Sidebar elbow
st.sidebar.subheader('Show elbow of all clusters:')
show_elbow = st.sidebar.checkbox('Calculate elbow?')
if show_elbow:
    # Evaluate the same feature columns the user selected for clustering;
    # fall back to the full table when nothing is selected.
    elbow_data = X if cluster_cols else df
    res = []
    num_cluster_ranges = range(2, 10)
    for cluster_count in num_cluster_ranges:
        # A separate model per candidate k; the main `pal_kmeans` is left alone.
        elbow_kmeans = KMeans(n_clusters=cluster_count)
        df_labels = elbow_kmeans.fit_predict(data=elbow_data, key='CUSTOMER_ID')
        # Squared Euclidean distance, computed as DISTANCE*DISTANCE.
        df_labels = df_labels.select('*', ('DISTANCE*DISTANCE', 'DISTANCE2'))
        # Average squared distance over all rows for this k.
        mean_sq_distance = df_labels.agg([('avg', 'DISTANCE2', 'DISTANCE2')]).collect()
        res.append(mean_sq_distance['DISTANCE2'].values[0])
    # Draw on a dedicated figure rather than pyplot's implicit global one.
    elbow_fig, elbow_ax = plt.subplots()
    elbow_ax.plot(num_cluster_ranges, res, marker='*')
    elbow_ax.set_xlabel('Number of clusters')
    elbow_ax.set_ylabel('Average squared distance')
    st.sidebar.pyplot(elbow_fig)
    plt.close(elbow_fig)
# Sidebar silhouette
st.sidebar.subheader('Silhouette:')
show_silhouette = st.sidebar.checkbox('Show silhouette?')
if show_silhouette:
    # Fetch all silhouette values in a single query; the overall average and
    # the per-cluster groups are derived from it locally.
    silhouettes = labels[['CLUSTER_ID', 'SLIGHT_SILHOUETTE']].collect()
    silhouette_avg = silhouettes['SLIGHT_SILHOUETTE'].mean()
    silhouette_fig, silhouette_ax = plt.subplots()
    silhouette_ax.set_title(f'AVG silhouette - {silhouette_avg}')
    y_lower = 10
    # Iterate over the cluster ids that actually occur. `n_clusters` is only
    # read here (not reassigned) so the colours stay consistent with the
    # table highlighting, which uses the same denominator.
    for i, cluster_values in silhouettes.groupby('CLUSTER_ID')['SLIGHT_SILHOUETTE']:
        ith_cluster_silhouette_values = np.sort(cluster_values.values)
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # Red dashed marker at the overall average, spanning this cluster's band.
        silhouette_ax.vlines(silhouette_avg, y_lower, y_upper, color='red', linestyles='--')
        color = cm.nipy_spectral(float(i) / n_clusters)
        silhouette_ax.fill_betweenx(np.arange(y_lower, y_upper),
                                    0, ith_cluster_silhouette_values,
                                    facecolor=color, edgecolor=color, alpha=0.7)
        # Leave a gap of 10 between consecutive cluster bands.
        y_lower = y_upper + 10
    st.sidebar.pyplot(silhouette_fig)
    plt.close(silhouette_fig)
    st.write('Silhouette avg score = ', silhouette_avg)
# Raw data viz
st.subheader('Data:')
# Attach each customer's cluster id as a trailing 'Cluster' column by joining
# on CUSTOMER_ID, so the assignment does not depend on result ordering.
cluster_by_customer = (
    labels[['CUSTOMER_ID', 'CLUSTER_ID']]
    .collect()
    .rename(columns={'CLUSTER_ID': 'Cluster'})
)
dfn = df.collect().merge(cluster_by_customer, on='CUSTOMER_ID')
highlight_cl = st.checkbox('Highlight clusters', False)
def highlight(s):
    """Return one CSS background-colour rule per column of `dfn` for row `s`.

    The colour is read from the nipy_spectral colormap at position
    ``s.Cluster / n_clusters`` and its alpha channel is replaced by 0.2.
    `n_clusters` and `dfn` are taken from module scope.
    """
    red, green, blue, _alpha = cm.nipy_spectral(float(s.Cluster) / n_clusters)
    hex_code = cm.colors.rgb2hex((red, green, blue, 0.2), keep_alpha=True)
    return [f'background-color: {hex_code}' for _ in range(len(dfn.columns))]
# Render the table: styled row by row with `highlight` when the checkbox is
# ticked, plain otherwise.
table_content = dfn.style.apply(highlight, axis=1) if highlight_cl else dfn
st.table(table_content)
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import plotly.express as px
import hana_ml.dataframe as dataframe
from hana_ml.algorithms.pal.clustering import KMeans
#Title info: browser-tab title and icon, page layout, initial sidebar state.
# `beta_set_page_config` is the pre-release name of `set_page_config`; newer
# Streamlit releases only ship the latter, so use whichever one is available.
_set_page_config = getattr(st, "set_page_config", None) or st.beta_set_page_config
_set_page_config(
    page_title="Cluster App",
    page_icon="https://cdn.appythings.nl/wp-content/uploads/2018/06/SAP-logo-icon-PNG-Transparent-Background.png",
    layout="centered",
    initial_sidebar_state="expanded",
)
st.title('Let\'s clustering with SAP HANA PAL...')
#Reading data
# Connection parameters for the SAP HANA Cloud instance; user and password
# are placeholders that have to be replaced before the app can connect.
_hana_settings = dict(
    address='e1b18bd9-abcc-4a3a-89b0-65b3a7bf1eed.hana.canary-eu10.hanacloud.ondemand.com',
    port=443,
    user='<USERNAME>',
    password='<PASS>',
    encrypt='true',
    sslValidateCertificate='false',
)
conn = dataframe.ConnectionContext(**_hana_settings)

# hana_ml dataframe over MLUSER.MALL_CUSTOMERS, sorted ascending by CUSTOMER_ID.
df = conn.table(table='MALL_CUSTOMERS', schema='MLUSER')
df = df.sort('CUSTOMER_ID', desc=False)
#Columns selector for clustering
cluster_cols = st.multiselect(
    "Which columns select for clustering",
    ['AGE', 'ANNUAL_INCOME', 'SPENDING_SCORE'],
    default=['ANNUAL_INCOME', 'SPENDING_SCORE'],
)
# Clustering needs at least one feature next to the key column; halt this
# script run until the user has picked one.
if not cluster_cols:
    st.warning('Select at least one column for clustering.')
    st.stop()
# Chosen feature columns plus the key column that fit_predict is given below.
X = df[cluster_cols + ['CUSTOMER_ID']]
#Clustering (slider: min 2, max 10, initial value 2)
n_clusters = st.slider('How many clusters?', 2, 10, 2)
pal_kmeans = KMeans(n_clusters=n_clusters)
# Fit on X rather than df so the column selection above determines the
# features that are clustered.
labels = pal_kmeans.fit_predict(data=X, key='CUSTOMER_ID')
# Viz
selected_x = st.selectbox('Select x_column:', ('AGE', 'ANNUAL_INCOME', 'SPENDING_SCORE'))
selected_y = st.selectbox('Select y_column:', ('AGE', 'ANNUAL_INCOME', 'SPENDING_SCORE'))
st.write(selected_x)
# Fetch the plotted columns once (de-duplicated, because the same column may
# be chosen for both axes) and join the cluster labels on the key, so that
# every point gets its own cluster id regardless of result ordering.
scatter_cols = list(dict.fromkeys(['CUSTOMER_ID', selected_x, selected_y]))
scatter_points = df[scatter_cols].collect().merge(
    labels[['CUSTOMER_ID', 'CLUSTER_ID']].collect(), on='CUSTOMER_ID'
)
# The whitegrid style is named differently across matplotlib versions; use
# whichever name is installed, otherwise fall back to the default style.
scatter_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
# plt.style.context only has an effect when used as a context manager.
with plt.style.context(scatter_style):
    scatter_fig, scatter_ax = plt.subplots()
    scatter_ax.scatter(scatter_points[selected_x],
                       scatter_points[selected_y],
                       c=scatter_points['CLUSTER_ID'])
    scatter_ax.set_xlabel(selected_x)
    scatter_ax.set_ylabel(selected_y)
    # Pass the figure explicitly instead of relying on pyplot's global figure.
    st.pyplot(scatter_fig)
# Close the figure so later pyplot calls start from a fresh one.
plt.close(scatter_fig)
# 3d chart viz
# Join the cluster labels to the customer rows on the key instead of pairing
# two independently collected results by position.
points_3d = df.collect().merge(
    labels[['CUSTOMER_ID', 'CLUSTER_ID']].collect(), on='CUSTOMER_ID'
)
fig = px.scatter_3d(points_3d, x='ANNUAL_INCOME', y='SPENDING_SCORE', z='AGE',
                    color='CLUSTER_ID')
st.plotly_chart(fig, use_container_width=True)
# Sidebar elbow
st.sidebar.subheader('Show elbow of all clusters:')
show_elbow = st.sidebar.checkbox('Calculate elbow?')
if show_elbow:
    # Evaluate the same feature columns the user selected for clustering;
    # fall back to the full table when nothing is selected.
    elbow_data = X if cluster_cols else df
    res = []
    num_cluster_ranges = range(2, 10)
    for cluster_count in num_cluster_ranges:
        # A separate model per candidate k; the main `pal_kmeans` is left alone.
        elbow_kmeans = KMeans(n_clusters=cluster_count)
        df_labels = elbow_kmeans.fit_predict(data=elbow_data, key='CUSTOMER_ID')
        # Squared Euclidean distance, computed as DISTANCE*DISTANCE.
        df_labels = df_labels.select('*', ('DISTANCE*DISTANCE', 'DISTANCE2'))
        # Average squared distance over all rows for this k.
        mean_sq_distance = df_labels.agg([('avg', 'DISTANCE2', 'DISTANCE2')]).collect()
        res.append(mean_sq_distance['DISTANCE2'].values[0])
    # Draw on a dedicated figure rather than pyplot's implicit global one.
    elbow_fig, elbow_ax = plt.subplots()
    elbow_ax.plot(num_cluster_ranges, res, marker='*')
    elbow_ax.set_xlabel('Number of clusters')
    elbow_ax.set_ylabel('Average squared distance')
    st.sidebar.pyplot(elbow_fig)
    plt.close(elbow_fig)
#Sidebar silhouette
st.sidebar.subheader('Silhouette:')
show_silhouette = st.sidebar.checkbox('Show silhouette?')
if show_silhouette:
    # Fetch all silhouette values in a single query; the overall average and
    # the per-cluster groups are derived from it locally.
    silhouettes = labels[['CLUSTER_ID', 'SLIGHT_SILHOUETTE']].collect()
    silhouette_avg = silhouettes['SLIGHT_SILHOUETTE'].mean()
    silhouette_fig, silhouette_ax = plt.subplots()
    silhouette_ax.set_title(f'AVG silhouette - {silhouette_avg}')
    y_lower = 10
    # Iterate over the cluster ids that actually occur. `n_clusters` is only
    # read here (not reassigned) so the colours stay consistent with the
    # table highlighting, which uses the same denominator.
    for i, cluster_values in silhouettes.groupby('CLUSTER_ID')['SLIGHT_SILHOUETTE']:
        ith_cluster_silhouette_values = np.sort(cluster_values.values)
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # Red dashed marker at the overall average, spanning this cluster's band.
        silhouette_ax.vlines(silhouette_avg, y_lower, y_upper, color='red', linestyles='--')
        color = cm.nipy_spectral(float(i) / n_clusters)
        silhouette_ax.fill_betweenx(np.arange(y_lower, y_upper),
                                    0, ith_cluster_silhouette_values,
                                    facecolor=color, edgecolor=color, alpha=0.7)
        # Leave a gap of 10 between consecutive cluster bands.
        y_lower = y_upper + 10
    st.sidebar.pyplot(silhouette_fig)
    plt.close(silhouette_fig)
    st.write('Silhouette avg score = ', silhouette_avg)
#Raw data viz
st.subheader('Data:')
# Attach each customer's cluster id as a trailing 'Cluster' column by joining
# on CUSTOMER_ID, so the assignment does not depend on result ordering.
cluster_by_customer = (
    labels[['CUSTOMER_ID', 'CLUSTER_ID']]
    .collect()
    .rename(columns={'CLUSTER_ID': 'Cluster'})
)
dfn = df.collect().merge(cluster_by_customer, on='CUSTOMER_ID')
# Checkbox label typo fixed ('cluters' -> 'clusters').
highlight_cl = st.checkbox('Highlight clusters', False)
def highlight(s):
    """Return one CSS background-colour rule per column of `dfn` for row `s`.

    The colour is read from the nipy_spectral colormap at position
    ``s.Cluster / n_clusters`` and its alpha channel is replaced by 0.2.
    `n_clusters` and `dfn` are taken from module scope.
    """
    red, green, blue, _alpha = cm.nipy_spectral(float(s.Cluster) / n_clusters)
    hex_code = cm.colors.rgb2hex((red, green, blue, 0.2), keep_alpha=True)
    return [f'background-color: {hex_code}' for _ in range(len(dfn.columns))]
# Render the table: styled row by row with `highlight` when the checkbox is
# ticked, plain otherwise.
table_content = dfn.style.apply(highlight, axis=1) if highlight_cl else dfn
st.table(table_content)
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
13 | |
11 | |
11 | |
10 | |
9 | |
7 | |
6 | |
5 | |
5 | |
5 |