C. Wolf et al. 2004, Astron. & Astrophys. http://www.mpia.de/COMBO/combo_index.html
We will perform the following analysis steps: exploratory summary statistics and a correlation analysis of selected variables, principal component analysis of groups of correlated variables, two-dimensional density estimation of a colour-magnitude diagram for low-redshift galaxies, and clustering of that diagram with several algorithms (agglomerative clustering, DBSCAN, k-means, and affinity propagation).
print(__doc__)
import numpy as np
import pandas as pd
# PCA and Factor Analysis
from sklearn.decomposition import PCA,FactorAnalysis
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.preprocessing import StandardScaler
# K-mean algorithm
from sklearn.cluster import KMeans
# Local Outlier Factor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from scipy.stats import rv_discrete
from scipy.stats import spearmanr,ks_2samp, chi2_contingency, anderson
from scipy.stats import probplot
np.random.seed(42)
# Read the data in csv format
df=pd.read_csv('COMBO17.csv', header=0)
# List the objects
df.head()
# select 8 columns
df_1=df[['Rmag', 'mumax', 'Mcz', 'MCzml', 'chi2red', 'UjMAG',
'BjMAG', 'VjMAG']]
df_1.head()
# Summary statistics
df_1.describe()
corr = df_1.corr().mul(100).astype(int)
sns.clustermap(data=corr, annot=True, fmt='d', cmap='Greens')
plt.show()
Strong correlations are found for Rmag versus mumax, for the two photometric-redshift estimates Mcz and MCzml, and among the absolute magnitudes UjMAG, BjMAG and VjMAG.
Correlations measure the strength of linear relationships between variables, and are only meaningful where such linear relationships actually hold.
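Because the Pearson coefficient only captures linear association, a rank-based measure can be used as a cross-check on a strongly correlated pair. A minimal sketch using spearmanr (imported above); the pair Mcz/MCzml is chosen purely for illustration:
# Cross-check one strongly correlated pair with a rank-based measure
# (Spearman), which is sensitive to any monotonic relationship
rho, pval = spearmanr(df_1['Mcz'], df_1['MCzml'], nan_policy='omit')
print('Spearman rho = %.3f (p = %.3g)' % (rho, pval))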
We perform a Principal Component Analysis of all the variables, excluding the measurement errors.
# Select the columns, excluding the measurement errors, and remove the instances
# (objects) with NaN
df_2 = df[['Rmag','ApDRmag','mumax','Mcz','MCzml','chi2red',
           'UjMAG','BjMAG','VjMAG','usMAG','gsMAG','rsMAG',
           'UbMAG','BbMAG','VnMAG','S280MAG',
           'W420FE','W462FE','W485FD','W518FE','W571FS','W604FE',
           'W646FD','W696FE','W753FE','W815FS','W856FD','W914FD','W914FE',
           'UFS','BFS','VFD','RFS','IFD']]
df_2 = df_2.dropna()
df_2.head()
corr = df_2.corr().mul(100).astype(int)
clustergrid = sns.clustermap(data=corr, annot=True, fmt='d',
cmap='Greens')
plt.show()
idx=clustergrid.dendrogram_col.reordered_ind
df_g1=df_2.iloc[:,idx[0:17]]
df_g2=df_2.iloc[:,idx[17:19]]
df_g3=df_2.iloc[:,idx[19:29]] # The colors are highly correlated
df_g4=df_2.iloc[:,idx[29:32]]
df_g5=df_2.iloc[:,idx[32:35]]
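To see which variables ended up in each dendrogram-ordered group, the column names can be printed explicitly; a minimal sketch:
# List the column names that fall into each dendrogram-ordered group
for name, dfg in zip(['g1', 'g2', 'g3', 'g4', 'g5'],
                     [df_g1, df_g2, df_g3, df_g4, df_g5]):
    print(name, list(dfg.columns))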
X_g1 = np.array(df_g1)
pca_g1=PCA()
pca_g1.fit(X_g1)
Xpca_g1 = pca_g1.transform(X_g1)
pca_components_g1=pd.DataFrame(pca_g1.components_)
pca_explained_variance_g1=pd.DataFrame(pca_g1.explained_variance_ratio_*100.)
pca_explained_variance_g1
df_g1.columns
pca_components_g1.iloc[:,0:5]
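The explained-variance ratios of group 1 can also be shown as a scree plot, which helps to judge how many components are worth keeping; a minimal, illustrative sketch:
# Scree plot for group 1: cumulative explained variance versus
# number of principal components
ncomp = np.arange(1, len(pca_g1.explained_variance_ratio_) + 1)
plt.figure(figsize=(6, 4))
plt.plot(ncomp, np.cumsum(pca_g1.explained_variance_ratio_) * 100., marker='o')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance (%)')
plt.title('PCA scree plot, group 1')
plt.show()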
X_g2 = np.array(df_g2)
pca_g2=PCA()
pca_g2.fit(X_g2)
Xpca_g2 = pca_g2.transform(X_g2)
pca_components_g2=pd.DataFrame(pca_g2.components_)
pca_explained_variance_g2=pd.DataFrame(pca_g2.explained_variance_ratio_*100.)
pca_explained_variance_g2
X_g3 = np.array(df_g3)
pca_g3=PCA()
pca_g3.fit(X_g3)
Xpca_g3 = pca_g3.transform(X_g3)
pca_components_g3=pd.DataFrame(pca_g3.components_)
pca_explained_variance_g3=pd.DataFrame(pca_g3.explained_variance_ratio_*100.)
pca_explained_variance_g3
X_g4 = np.array(df_g4)
pca_g4=PCA()
pca_g4.fit(X_g4)
Xpca_g4 = pca_g4.transform(X_g4)
pca_components_g4=pd.DataFrame(pca_g4.components_)
pca_explained_variance_g4=pd.DataFrame(pca_g4.explained_variance_ratio_*100.)
pca_explained_variance_g4
df_g4.columns
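FactorAnalysis was imported alongside PCA at the top of this section; as an illustrative cross-check, a one-factor model can be fitted to a group and its scores compared with the first principal component. A minimal sketch for group 1, assuming X_g1 and Xpca_g1 are still in scope:
# Fit a single-factor model to group 1 and compare its scores with PC1
# (illustrative cross-check; not part of the main analysis)
fa_g1 = FactorAnalysis(n_components=1)
Xfa_g1 = fa_g1.fit_transform(X_g1)
print('Correlation between factor 1 and PC1: %.3f'
      % np.corrcoef(Xfa_g1[:, 0], Xpca_g1[:, 0])[0, 1])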
# Analyse the transformed data: the first principal component of each of
# the four groups plus the remaining original features (group 5)
Xpca=np.vstack((Xpca_g1[:,0],Xpca_g2[:,0],Xpca_g3[:,0],
Xpca_g4[:,0],df_2.iloc[:,idx[32:35]].T)).T
df_Xpca=pd.DataFrame(Xpca)
corr_Xpca = df_Xpca.corr().mul(100).astype(int)
clustergrid = sns.clustermap(data=corr_Xpca, annot=True, fmt='d',
cmap='Greens')
plt.show()
# Select low-redshift objects (Mcz < 0.3) and the absolute magnitudes
# MB ('BjMAG') and M280 ('S280MAG'); see p. 242 of Feigelson's textbook
df_3=df_2[['Mcz','BjMAG','S280MAG']]
df_3 = df_3[df_3['Mcz'] < 0.3]
x_3 = df_3['BjMAG']
y_3 = df_3['S280MAG']-df_3['BjMAG']
# Two-dimensional kernel-density estimator
# input x,y are the locations of the points, here the galaxies
# return a density map at grid locations xx,yy
from sklearn.neighbors import KernelDensity
def kde2D(x, y, bandwidth=0.1, xbins=100j, ybins=100j, **kwargs):
    # create grid of sample locations (default: 100x100)
    xx, yy = np.mgrid[x.min():x.max():xbins,
                      y.min():y.max():ybins]
    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train = np.vstack([y, x]).T
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(xy_train)
    # score_samples() returns the log-likelihood of the samples
    z = np.exp(kde_skl.score_samples(xy_sample))
    return xx, yy, np.reshape(z, xx.shape)
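The bandwidth controls the smoothness of the kernel density estimate and is chosen by eye below; it can instead be selected by cross-validation. A minimal sketch using sklearn's GridSearchCV, assuming x_3 and y_3 are defined (the bandwidth grid is purely illustrative):
# Select the KDE bandwidth by maximising the cross-validated log-likelihood
from sklearn.model_selection import GridSearchCV
xy_cv = np.vstack([y_3, x_3]).T
grid = GridSearchCV(KernelDensity(),
                    {'bandwidth': np.linspace(0.05, 0.5, 10)}, cv=5)
grid.fit(xy_cv)
print('Best bandwidth: %.2f' % grid.best_params_['bandwidth'])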
from astroML.density_estimation import KNeighborsDensity
#help("astroML.density_estimation.KNeighborsDensity")
def knd2D(x, y, method='bayesian', nneighbours=5, xbins=100j, ybins=100j):
    # create grid of sample locations (default: 100x100)
    xx, yy = np.mgrid[x.min():x.max():xbins,
                      y.min():y.max():ybins]
    xy_sample = np.vstack([yy.ravel(), xx.ravel()]).T
    xy_train = np.vstack([y, x]).T
    knn = KNeighborsDensity(method, nneighbours)
    z = knn.fit(xy_train).eval(xy_sample)
    return xx, yy, np.reshape(z, xx.shape)
def density_plots():
    plt.subplots(figsize=(12, 7))
    # Left panel: scatter plot of the galaxies
    splot = plt.subplot(121)
    plt.scatter(x_3, y_3, marker='.')
    plt.xlabel('B (mag)')
    plt.ylabel('S280-B (mag)')
    plt.title("COMBO 17")
    # Right panel: estimated density on the grid
    splot = plt.subplot(122)
    plt.pcolormesh(xx, yy, zz, cmap=plt.cm.binary)
    plt.xlabel('B (mag)')
    plt.ylabel('S280-B (mag)')
    plt.title("COMBO 17")
    plt.show()
xx,yy,zz = kde2D(x_3,y_3,bandwidth=0.15,xbins=150j,ybins=150j)
density_plots()
xx,yy,zz = knd2D(x_3,y_3,nneighbours=20,xbins=150j,ybins=150j)
density_plots()
col_list = ['blue','red','green','darkgoldenrod','darkgreen',
'darkmagenta','silver','darkorange','gold',
'darkorchid','aqua','cadetblue',
'darkolivegreen','burlywood','chartreuse',
'chocolate','coral','cornflowerblue','black',
'darkkhaki','pink','moccasin','limegreen']
def plot_clustering(title):
    plt.subplots(figsize=(8, 5))
    plt.subplot(111)
    # Background: all points; foreground: one colour per cluster label
    plt.scatter(x_3, y_3, alpha=0.5, marker=".")
    plt.title(title)
    for i in range(n_clusters_):
        plt.scatter(x_3[labels == i], y_3[labels == i],
                    alpha=0.5, marker=".", color=col_list[i])
    plt.xlabel('B (mag)')
    plt.ylabel('S280-B (mag)')
    plt.show()
from sklearn.preprocessing import StandardScaler
X_3 = np.vstack((x_3,y_3))
X_3=X_3.T
X_3s = StandardScaler().fit_transform(X_3)
from sklearn.cluster import AgglomerativeClustering
#help("sklearn.cluster.AgglomerativeClustering")
# affinity: "euclidean", "l1", "l2", "manhattan", "cosine", or 'precomputed'
n_clusters_ = 5
ACmodel = AgglomerativeClustering(n_clusters=n_clusters_,
affinity='euclidean',
linkage='complete')
ACmodel.fit(X_3s)
labels = ACmodel.labels_
plot_clustering("Agglomerative Clustering")
from sklearn.cluster import DBSCAN
#help("sklearn.cluster.DBSCAN")
DBSCAN requires two user-supplied parameters: the minimum number of points needed to form a dense region (min_samples) and the neighbourhood radius within which those points must lie (eps). By trial and error, Feigelson found that a minimum of five points within 0.3 standardized magnitude units provided a useful result; here we use eps=0.25 and min_samples=25.
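A common alternative to pure trial and error for choosing eps is a k-distance plot: sort every point's distance to its k-th nearest neighbour and look for the 'knee' of the curve. A minimal, illustrative sketch, assuming X_3s is the standardized array and taking k equal to the min_samples value used below:
# k-distance plot: a heuristic for choosing the DBSCAN eps parameter
from sklearn.neighbors import NearestNeighbors
k = 25  # same as min_samples below
nbrs = NearestNeighbors(n_neighbors=k).fit(X_3s)
dist, _ = nbrs.kneighbors(X_3s)  # each point counts itself as its first neighbour
plt.plot(np.sort(dist[:, -1]))
plt.xlabel('Points sorted by k-distance')
plt.ylabel('Distance to %d-th nearest neighbour' % k)
plt.show()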
db = DBSCAN(eps=0.25,min_samples=25).fit(X_3s)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
plot_clustering("DBSCAN")
from sklearn.cluster import KMeans
n_clusters_ = 2
k_means = KMeans(n_clusters=n_clusters_)
k_means.fit(X_3s)
labels = k_means.labels_
plot_clustering('k-means')
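The choice of two clusters for k-means is again made by hand; the silhouette coefficient gives a quick way to compare different values of k. A minimal, illustrative sketch:
# Compare k-means partitions for several cluster counts via silhouette score
from sklearn.metrics import silhouette_score
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=42).fit(X_3s)
    print('k = %d, silhouette = %.3f'
          % (k, silhouette_score(X_3s, km.labels_)))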
from sklearn.cluster import AffinityPropagation
from sklearn.metrics import silhouette_score
# Compute Affinity Propagation
af = AffinityPropagation(damping=.9,preference=-200).fit(X_3s)
cluster_centers_indices = af.cluster_centers_indices_
af_labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
% silhouette_score(X_3s, af_labels, metric='sqeuclidean'))
labels = af_labels
plot_clustering("Affinity Propagation")