## 1. Project Overview

Cryptocurrency is a digital currency built on cryptographic proof, which is essential to verify each transaction. Cryptocurrencies are distinguished by several features: they do not depend on central authorities such as central banks, they provide a degree of pseudo-anonymity, and they protect against double-spending attacks. This project applies financial Python programming and unsupervised learning algorithms, K-Means and Principal Component Analysis (PCA), to categorize cryptocurrencies according to their performance over different timeframes. The dataset used in this analysis is a CSV file containing return (price-change) data for cryptocurrencies across diverse time periods.

### 1.1. K-Means

K-Means is an unsupervised machine learning algorithm used for clustering data. It partitions n observations into k clusters, where each observation belongs to the cluster with the nearest mean. The algorithm works by iteratively updating the cluster centroids and assigning data points to their closest centroid until convergence. K-Means is widely used in fields such as image segmentation, customer segmentation, and anomaly detection.

### 1.2. Principal Component Analysis (PCA)

Principal Component Analysis (PCA) is a popular technique for dimensionality reduction in machine learning. It transforms high-dimensional data into a lower-dimensional space while retaining as much of the original data's variance as possible. PCA works by identifying the principal components, the linear combinations of the original variables that explain the most variance in the data. The resulting principal components can be used as new features in machine learning models.
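Before diving into the project code, here is a minimal, self-contained sketch (not part of the project's own modules) of how K-Means and PCA are typically applied with scikit-learn; the synthetic data and all variable names here are illustrative assumptions, not the crypto dataset used below.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

rng = np.random.default_rng(42)
# synthetic data: 60 samples drawn around three centers in 5 dimensions
centers = rng.normal(scale=5.0, size=(3, 5))
X = np.vstack([c + rng.normal(size=(20, 5)) for c in centers])

# K-Means: partition the samples into k=3 clusters
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(X)

# PCA: project the 5-dimensional data onto its first 2 principal components
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)

print(labels[:10])
print(X_2d.shape)  # (60, 2)
```

The project applies the same two steps (fit K-Means for cluster labels, fit PCA for a reduced representation) to standardized cryptocurrency returns.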
# dependencies and setup
from package.constants import * # constants
from package.helpers import * # libraries, dependencies and functions
☑ constants is imported ☑ helpers is imported
# load the data into a Pandas DataFrame
df_market_data = pd.read_csv(DATA_URL+"crypto_market_data.csv", index_col="coin_id")
# display sample data
df_market_data.head()
price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
---|---|---|---|---|---|---|---|
coin_id | |||||||
bitcoin | 1.08388 | 7.60278 | 6.57509 | 7.67258 | -3.25185 | 83.51840 | 37.51761 |
ethereum | 0.22392 | 10.38134 | 4.80849 | 0.13169 | -12.88890 | 186.77418 | 101.96023 |
tether | -0.21173 | 0.04935 | 0.00640 | -0.04237 | 0.28037 | -0.00542 | 0.01954 |
ripple | -0.37819 | -0.60926 | 2.24984 | 0.23455 | -17.55245 | 39.53888 | -16.60193 |
bitcoin-cash | 2.90585 | 17.09717 | 14.75334 | 15.74903 | -13.71793 | 21.66042 | 14.49384 |
# generate the df info
df_market_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 41 entries, bitcoin to digibyte
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   price_change_percentage_24h   41 non-null     float64
 1   price_change_percentage_7d    41 non-null     float64
 2   price_change_percentage_14d   41 non-null     float64
 3   price_change_percentage_30d   41 non-null     float64
 4   price_change_percentage_60d   41 non-null     float64
 5   price_change_percentage_200d  41 non-null     float64
 6   price_change_percentage_1y    41 non-null     float64
dtypes: float64(7)
memory usage: 2.6+ KB
# generate the summary statistics
df_market_data.describe(include = 'all').round(2)
price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
---|---|---|---|---|---|---|---|
count | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 |
mean | -0.27 | 4.50 | 0.19 | 1.55 | -0.09 | 236.54 | 347.67 |
std | 2.69 | 6.38 | 8.38 | 26.34 | 47.37 | 435.23 | 1247.84 |
min | -13.53 | -6.09 | -18.16 | -34.71 | -44.82 | -0.39 | -17.57 |
25% | -0.61 | 0.05 | -5.03 | -10.44 | -25.91 | 21.66 | 0.41 |
50% | -0.06 | 3.30 | 0.11 | -0.04 | -7.54 | 83.91 | 69.69 |
75% | 0.61 | 7.60 | 5.51 | 4.58 | 0.66 | 216.18 | 168.37 |
max | 4.84 | 20.69 | 24.24 | 140.80 | 223.06 | 2227.93 | 7852.09 |
# plotting data
line(df_market_data,"Price Change Over Time")
# use the StandardScaler() module to normalize the data from the CSV file
data_scaled = StandardScaler().fit_transform(df_market_data)
# create a df for the scaled data and set the coinid column as index
df_market_scaled = pd.DataFrame(data_scaled, columns=df_market_data.columns, index=df_market_data.index)
# display sample data
df_market_scaled.head()
price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
---|---|---|---|---|---|---|---|
coin_id | |||||||
bitcoin | 0.508529 | 0.493193 | 0.772200 | 0.235460 | -0.067495 | -0.355953 | -0.251637 |
ethereum | 0.185446 | 0.934445 | 0.558692 | -0.054341 | -0.273483 | -0.115759 | -0.199352 |
tether | 0.021774 | -0.706337 | -0.021680 | -0.061030 | 0.008005 | -0.550247 | -0.282061 |
ripple | -0.040764 | -0.810928 | 0.249458 | -0.050388 | -0.373164 | -0.458259 | -0.295546 |
bitcoin-cash | 1.193036 | 2.000959 | 1.760610 | 0.545842 | -0.291203 | -0.499848 | -0.270317 |
# generate the summary statistics for scaled data
df_market_scaled.describe(include = 'all').round(2)
price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
---|---|---|---|---|---|---|---|
count | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 | 41.00 |
mean | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 |
std | 1.01 | 1.01 | 1.01 | 1.01 | 1.01 | 1.01 | 1.01 |
min | -4.98 | -1.68 | -2.22 | -1.39 | -0.96 | -0.55 | -0.30 |
25% | -0.13 | -0.71 | -0.63 | -0.46 | -0.55 | -0.50 | -0.28 |
50% | 0.08 | -0.19 | -0.01 | -0.06 | -0.16 | -0.36 | -0.23 |
75% | 0.33 | 0.49 | 0.64 | 0.12 | 0.02 | -0.05 | -0.15 |
max | 1.92 | 2.57 | 2.91 | 5.35 | 4.77 | 4.63 | 6.09 |
# plotting scaled data
line(df_market_scaled,"Standardized Price Change Over Time")
Determining the best number of clusters (k) depends on the specific problem and data being analyzed. However, we can use the information provided by the three methods to make a recommendation. (1) Elbow method suggests that the optimal number of clusters is where the change in Within-Cluster Sum of Squares (WCSS) begins to level off. (2) Silhouette method measures how well each data point fits into its assigned cluster and ranges from -1 to 1, where values closer to 1 indicate a better fit. (3) Calinski-Harabasz index measures the ratio of between-cluster variance to within-cluster variance, and a higher value indicates better clustering.
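The three methods above can be sketched with scikit-learn directly. This is an illustrative stand-in, not the project's `clusters_methods` helper; the synthetic data and the range of k are assumptions for demonstration.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score

rng = np.random.default_rng(0)
# synthetic data: three well-separated groups of 30 samples in 4 dimensions
X = np.vstack([rng.normal(loc=m, size=(30, 4)) for m in (0.0, 5.0, 10.0)])

for k in range(2, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    wcss = km.inertia_                           # (1) elbow method input
    sil = silhouette_score(X, km.labels_)        # (2) in [-1, 1], higher is better
    ch = calinski_harabasz_score(X, km.labels_)  # (3) higher is better
    print(f"k={k}  WCSS={wcss:.1f}  silhouette={sil:.3f}  CH={ch:.1f}")
```

Plotting WCSS against k and looking for the "elbow," and comparing silhouette and Calinski-Harabasz scores across k, is exactly what the helper functions below automate.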
# determine the optimal value of k using the "clusters_methods" function in the "helpers" package located at "./src/package/helpers"
cluster_results, optimal_ks = clusters_methods(df_market_scaled, ["wcss_elbow", "silhouette", "calinski_harabasz"])
# plotting methods by "score_plot" function in the "helpers" package located at "./src/package/helpers"
score_plot(cluster_results, optimal_ks)
After analyzing the data and evaluating the elbow method plot, it appears that k=4 could be the optimal number of clusters, as the change in WCSS starts to level off around this point. However, the silhouette score suggests k=3 as the best choice. Additionally, the Calinski-Harabasz index is highest for k=4, further indicating that this value could be optimal. With k=3, k=4, and possibly k=5 as reasonable options, selecting the ideal number of clusters is subjective and requires domain knowledge and discretion.
Cluster with each candidate value of k and plot the results using the "scatter_cluster" function in the "helpers" package located at "./src/package/helpers".
# clustering the optimal value of k=3
plot_3, prediction_3=scatter_cluster(3, df_market_scaled, ["price_change_percentage_24h", "price_change_percentage_7d"])
# clustering the optimal value of k=4
plot_4, prediction_4=scatter_cluster(4, df_market_scaled, ["price_change_percentage_24h", "price_change_percentage_7d"])
# clustering the optimal value of k=5
plot_5, prediction_5=scatter_cluster(5, df_market_scaled, ["price_change_percentage_24h", "price_change_percentage_7d"])
# create a copy of df_market_scaled and print the prediction results for each k
df_copy = df_market_scaled.copy()
df_copy = df_copy[["price_change_percentage_24h", "price_change_percentage_7d"]]
# add a new column to the DataFrame with the predicted clusters
predictions = [prediction_3, prediction_4, prediction_5]
for i, prediction in enumerate(predictions):
    df_copy["k_" + str(i + 3)] = prediction
# print df_copy head
df_copy
price_change_percentage_24h | price_change_percentage_7d | k_3 | k_4 | k_5 | |
---|---|---|---|---|---|
coin_id | |||||
bitcoin | 0.508529 | 0.493193 | 1 | 3 | 1 |
ethereum | 0.185446 | 0.934445 | 1 | 3 | 1 |
tether | 0.021774 | -0.706337 | 1 | 1 | 4 |
ripple | -0.040764 | -0.810928 | 1 | 1 | 4 |
bitcoin-cash | 1.193036 | 2.000959 | 1 | 3 | 1 |
binancecoin | 0.891871 | 1.327295 | 1 | 3 | 1 |
chainlink | 0.011397 | 2.572251 | 1 | 3 | 1 |
cardano | 0.102530 | 1.508001 | 1 | 3 | 1 |
litecoin | 0.077497 | 0.334297 | 1 | 3 | 4 |
bitcoin-cash-sv | 0.448952 | -0.190684 | 1 | 1 | 4 |
crypto-com-chain | 0.331280 | -1.614844 | 1 | 1 | 0 |
usd-coin | 0.034352 | -0.733026 | 1 | 1 | 4 |
eos | 0.155710 | -0.922491 | 1 | 1 | 4 |
monero | 0.262723 | 1.792602 | 1 | 3 | 1 |
tron | 0.130050 | -0.041018 | 1 | 1 | 4 |
tezos | -0.151583 | 0.708196 | 1 | 3 | 4 |
okb | -0.923203 | -1.437359 | 1 | 1 | 0 |
stellar | -0.277543 | -0.385209 | 1 | 1 | 4 |
cosmos | -0.255978 | 1.840274 | 1 | 3 | 1 |
cdai | 0.180851 | -0.704931 | 1 | 1 | 4 |
neo | 0.286546 | -0.326301 | 1 | 1 | 0 |
wrapped-bitcoin | 0.515453 | 0.461843 | 1 | 3 | 1 |
leo-token | 0.051758 | -0.928381 | 1 | 1 | 0 |
huobi-token | -0.052032 | -0.457229 | 1 | 1 | 4 |
nem | -0.217984 | -0.849381 | 1 | 1 | 4 |
binance-usd | 0.061339 | -0.706669 | 1 | 1 | 4 |
iota | 0.259097 | 0.249508 | 1 | 1 | 4 |
vechain | 0.585089 | -0.994231 | 1 | 1 | 0 |
zcash | -0.127467 | 0.929119 | 1 | 3 | 1 |
theta-token | -1.612188 | -1.682027 | 1 | 1 | 0 |
dash | -0.296940 | 0.094763 | 1 | 1 | 4 |
ethereum-classic | -0.071312 | -0.229484 | 1 | 1 | 4 |
ethlend | -4.981042 | -0.045178 | 2 | 0 | 3 |
maker | -0.125168 | 0.580730 | 1 | 3 | 4 |
havven | -1.428574 | -0.025510 | 1 | 1 | 0 |
omisego | 1.919812 | 0.370447 | 1 | 1 | 0 |
celsius-degree-token | 1.045530 | -0.618328 | 0 | 2 | 2 |
ontology | -0.409044 | -0.906963 | 1 | 1 | 0 |
ftx-token | 0.414711 | 0.414044 | 1 | 1 | 4 |
true-usd | 0.078038 | -0.687745 | 1 | 1 | 4 |
digibyte | 1.217453 | -0.607714 | 1 | 1 | 0 |
# Create column names for the PCA components
pca_columns = ['PCA{}'.format(i) for i in range(1, 4)]
# Perform PCA on the scaled data and create a DataFrame with the results
pca = PCA(n_components=3)
market_pca = pca.fit_transform(df_market_scaled)
market_pca_df = pd.DataFrame(market_pca, columns=pca_columns, index=df_market_data.index)
# Display the first few rows of the DataFrame
market_pca_df.head()
PCA1 | PCA2 | PCA3 | |
---|---|---|---|
coin_id | |||
bitcoin | -0.600667 | 0.842760 | 0.461595 |
ethereum | -0.458261 | 0.458466 | 0.952877 |
tether | -0.433070 | -0.168126 | -0.641752 |
ripple | -0.471835 | -0.222660 | -0.479053 |
bitcoin-cash | -1.157800 | 2.041209 | 1.859715 |
# retrieve the explained variance to determine how much information
print(f"pca explained for first 3 components out of 7: {[f'{v:.2f}' for v in pca.explained_variance_ratio_[:3]]}")
# total explained variance with 3 components
print(f"total explained variance with 3 components: {sum(pca.explained_variance_ratio_[:3]):.2f}")
pca explained for first 3 components out of 7: ['0.37', '0.35', '0.18']
total explained variance with 3 components: 0.90
The principal component analysis (PCA) of our data indicates that the first three principal components account for approximately 90% of the total variance in the data. The first principal component explains around 37% of the variance, the second explains approximately 35%, and the third explains about 18%. Therefore, if we were to discard the other four components, we would only lose 10% of the variance in the data.
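The "keep enough components to reach a variance threshold" reasoning can be sketched as follows. This is a hedged illustration on synthetic correlated data, not the project's crypto figures; the 90% threshold and variable names are assumptions.

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(1)
# 41 samples of 7 correlated features (mimicking the dataset's shape)
X = rng.normal(size=(41, 7)) @ rng.normal(size=(7, 7))

pca = PCA().fit(X)  # fit all 7 components
cumulative = np.cumsum(pca.explained_variance_ratio_)
# smallest number of components whose cumulative variance reaches 90%
n_keep = int(np.searchsorted(cumulative, 0.90) + 1)
print(cumulative.round(2))
print("components for >= 90% variance:", n_keep)
```

In the project's case, the cumulative ratio of the first three real components reaches roughly 0.90, which is why three components were retained.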
# determine the optimal value of k for the PCA data using the "clusters_methods" function in the "helpers" package located at "./src/package/helpers"
cluster_results_pca, optimal_ks_pca = clusters_methods(market_pca_df, ["wcss_elbow", "silhouette", "calinski_harabasz"])
# plotting methods by "score_plot" function in the "helpers" package located at "./src/package/helpers"
score_plot(cluster_results_pca, optimal_ks_pca)
After analyzing the data and evaluating the elbow method plot, it appears that k=4 could be the optimal number of clusters as the WCSS change starts to level off. However, the silhouette score suggests k=3 as the best choice, while the Calinski-Harabasz index is highest for k=10. Since the Calinski-Harabasz index may not work well for finding the best k in PCA situations, selecting the ideal number of clusters is subjective and requires domain knowledge and discretion, with k=3 and k=4 being reasonable options.
Cluster with each candidate value of k and plot the results using the "scatter_3d_cluster" function in the "helpers" package located at "./src/package/helpers".
# clustering the optimal value of k=3
pca_plot_3, pca_prediction_3 = scatter_3d_cluster(3, market_pca_df, ['PCA1', 'PCA2', 'PCA3'])
# clustering the optimal value of k=4
pca_plot_4, pca_prediction_4 = scatter_3d_cluster(4, market_pca_df, ['PCA1', 'PCA2', 'PCA3'])
# create a copy of market_pca_df and print the prediction results for each k
df_pca_copy = market_pca_df.copy()
# add a new column to the DataFrame with the predicted clusters
predictions = [pca_prediction_3, pca_prediction_4]
for i, prediction in enumerate(predictions):
    df_pca_copy["k_" + str(i + 3)] = prediction
# display sample data
df_pca_copy
PCA1 | PCA2 | PCA3 | k_3 | k_4 | |
---|---|---|---|---|---|
coin_id | |||||
bitcoin | -0.600667 | 0.842760 | 0.461595 | 1 | 1 |
ethereum | -0.458261 | 0.458466 | 0.952877 | 1 | 1 |
tether | -0.433070 | -0.168126 | -0.641752 | 1 | 0 |
ripple | -0.471835 | -0.222660 | -0.479053 | 1 | 0 |
bitcoin-cash | -1.157800 | 2.041209 | 1.859715 | 1 | 1 |
binancecoin | -0.516534 | 1.388377 | 0.804071 | 1 | 1 |
chainlink | -0.450711 | 0.517699 | 2.846143 | 1 | 1 |
cardano | -0.345600 | 0.729439 | 1.478013 | 1 | 1 |
litecoin | -0.649468 | 0.432165 | 0.600303 | 1 | 1 |
bitcoin-cash-sv | -0.759014 | -0.201200 | -0.217653 | 1 | 0 |
crypto-com-chain | -0.248198 | -1.376252 | -1.462026 | 1 | 0 |
usd-coin | -0.438408 | -0.175337 | -0.663388 | 1 | 0 |
eos | -0.693425 | -0.473815 | -0.527597 | 1 | 0 |
monero | 0.060499 | 2.909404 | 1.498571 | 1 | 1 |
tron | -0.393352 | -0.108192 | -0.012756 | 1 | 0 |
tezos | -0.796176 | -0.494409 | 1.082812 | 1 | 1 |
okb | 0.064075 | -1.269825 | -1.098829 | 1 | 0 |
stellar | -0.489015 | -0.732719 | -0.062543 | 1 | 0 |
cosmos | -0.306272 | 0.703415 | 1.714224 | 1 | 1 |
cdai | -0.513528 | -0.142802 | -0.656566 | 1 | 0 |
neo | -0.362120 | -0.986914 | -0.728752 | 1 | 0 |
wrapped-bitcoin | -0.604265 | 0.827398 | 0.439316 | 1 | 1 |
leo-token | -0.413296 | -0.674115 | -1.076628 | 1 | 0 |
huobi-token | -0.407483 | -0.212507 | -0.351426 | 1 | 0 |
nem | 0.608974 | 0.563532 | -1.148742 | 1 | 0 |
binance-usd | -0.450211 | -0.151019 | -0.647401 | 1 | 0 |
iota | -0.764665 | -0.517886 | 0.204990 | 1 | 0 |
vechain | -0.556315 | -1.938209 | -1.261776 | 1 | 0 |
zcash | -0.425147 | 0.492976 | 1.058048 | 1 | 1 |
theta-token | 2.676868 | -0.013954 | -1.965207 | 1 | 0 |
dash | -0.613923 | -0.479337 | 0.339565 | 1 | 0 |
ethereum-classic | -0.579924 | -0.356334 | -0.114942 | 1 | 0 |
ethlend | 8.089018 | -3.896891 | 2.301382 | 2 | 2 |
maker | -0.389045 | 0.165041 | 0.379414 | 1 | 1 |
havven | 0.865762 | -2.261882 | 0.275583 | 1 | 0 |
omisego | 0.111675 | 0.428316 | -1.205398 | 1 | 0 |
celsius-degree-token | 4.792395 | 6.767679 | -1.986985 | 0 | 3 |
ontology | -0.632355 | -2.108117 | -0.652227 | 1 | 0 |
ftx-token | -0.593142 | 0.021485 | 0.209911 | 1 | 0 |
true-usd | -0.458131 | -0.135734 | -0.635284 | 1 | 0 |
digibyte | -0.297910 | -0.191126 | -0.909602 | 1 | 0 |
Upon analyzing the scatter plots of the dataset for different values of k (i.e., k=3 and k=4), k=4 appears to be the optimal choice: the scatter plot with k=3 shows a significant amount of unclustered data, indicating that three clusters might not be sufficient for this dataset.
In conclusion, our analysis suggests that k=4 is the most suitable number of clusters for this dataset, as supported by the elbow method plot, silhouette score, and scatter plot analysis. While the Calinski-Harabasz index may not work well in PCA situations, k=3 and k=4 remain reasonable options. The top three principal components account for approximately 90% of the total variance, and clustering on fewer components produces more well-defined clusters. Plotting the resulting clusters in the principal component space clarifies the separation between the second and third clusters, which overlapped in the full-feature plot. However, clustering on fewer components also loses focus on specific timeframes of interest. Ultimately, selecting the ideal number of clusters is subjective and requires domain knowledge and discretion.