|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import hdbscan |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
import matplotlib |
|
|
import matplotlib.pyplot as plt |
|
|
from sklearn import manifold |
|
|
from ipywidgets import interact, Output |
|
|
from IPython.display import clear_output |
|
|
|
|
|
|
|
|
from sklearn import manifold |
|
|
from sklearn import decomposition |
|
|
from sklearn import metrics |
|
|
from functools import partial |
|
|
import hdbscan |
|
|
|
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
from matplotlib.pyplot import figure |
|
|
import pandas |
|
|
import networkx as nx |
|
|
|
|
|
|
|
|
from matplotlib.colors import LinearSegmentedColormap |
|
|
from ete3 import Tree, TreeStyle, NodeStyle |
|
|
|
|
|
# Progress marker: emitted once every import above has succeeded.
print("import_complete")
|
|
|
|
|
|
|
|
# --- Run configuration ------------------------------------------------------

# Length of each fingerprint; embedded in the input file name below.
FINGERPRINT_LENGTH = 60

# Base name of the fingerprint data set. The string contains no replacement
# field, so the former ``.format(FINGERPRINT_LENGTH)`` call was a no-op and
# has been dropped; the length is appended when INPUT_NAME is built.
FINGERPRINT_NAME = "all_k_branches_histogram_-8_to_8"

# Embedded in the input file name.
# NOTE(review): presumably the perplexity of an upstream t-SNE run - confirm.
PERPLEXITY = 30

# When True, only rows with horz_flat_seg > 0 are kept after loading.
FLAT_ONLY = True

# Column names set aside as non-informative.
# NOTE(review): how this list is consumed is not visible in this section.
BORING_COLUMNS = [
    "flat_segments",
    "flatness_score",
    "binary_flatness",
    "horz_flat_seg",
    "exfoliation_eg",
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
]

# CSV file that holds the fingerprints for this configuration.
INPUT_NAME = f"{FINGERPRINT_NAME}_perplexity_{PERPLEXITY}_length_{FINGERPRINT_LENGTH}.csv"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the fingerprint table; the "ID" column becomes the index.
df = pd.read_csv(INPUT_NAME, index_col="ID")

if FLAT_ONLY:
    # Keep only rows with horz_flat_seg > 0. The explicit copy makes the
    # filtered frame independent of the one read from disk, so the columns
    # added to it later do not trigger pandas' SettingWithCopyWarning.
    df = df[df.horz_flat_seg > 0].copy()

# Notebook preview of the first rows (a bare expression has no visible
# effect when this file is run as a plain script).
df.head()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): MS / SS equal the min_cluster_size / min_samples values that
# are hard-coded in the HDBSCAN call further down; presumably they were meant
# to parameterise it - confirm before relying on them.
MS = 4
SS = 3

# Fingerprint columns are named "0", "1", ..., one per fingerprint position.
fingerprint_cols = list(map(str, range(FINGERPRINT_LENGTH)))

# The raw fingerprint columns are appended (in place) to the boring columns.
BORING_COLUMNS.extend(fingerprint_cols)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HDBSCAN configured to return leaf clusters and to keep the minimum
# spanning tree. One keyword per line; no backslash continuation is needed
# inside the parentheses.
# NOTE(review): a Minkowski exponent p < 1 does not satisfy the triangle
# inequality, so p=0.2 is not a true metric; some tree-based neighbour
# back-ends reject it - confirm this is intentional for the installed
# hdbscan / scikit-learn versions.
clusterer = hdbscan.HDBSCAN(
    algorithm="best",
    alpha=1.0,
    approx_min_span_tree=True,
    gen_min_span_tree=True,
    leaf_size=40,
    metric="minkowski",
    cluster_selection_method="leaf",
    min_cluster_size=4,
    min_samples=3,
    p=0.2,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Cluster on the fingerprint columns only.
db = clusterer.fit(df[fingerprint_cols])
labels = db.labels_

# Store the per-row cluster label and membership probability on the frame.
df["labels"] = labels
df["member_strength"] = db.probabilities_

# Report how many rows carry the label -1.
noise_mask = df["labels"] == -1
print(int(noise_mask.sum()))
|
|
|
|
|
|
|
|
# Condensed cluster hierarchy produced by the fit, plus its plotting data.
cond_tree = db.condensed_tree_
plot_obj = cond_tree.get_plot_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import matplotlib

# Colour map used to colour tree leaves by cluster label.
# plt.get_cmap() is used instead of plt.cm.get_cmap(), which was deprecated
# in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("turbo")

# Map the observed label range onto [0, 1] for the colour map.
norm = matplotlib.colors.Normalize(vmin=min(labels), vmax=max(labels))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Condensed tree as an edge table (one row per parent -> child edge).
panda_data = cond_tree.to_pandas()

# NOTE(review): _select_clusters() is a private hdbscan method and may change
# between releases; it returns the tree node ids of the selected clusters.
selected_clusters = cond_tree._select_clusters()

# Keep only edges whose child holds more than one point. The explicit copy
# makes the in-place column edits below safe (no SettingWithCopyWarning).
G1 = panda_data[panda_data["child_size"] > 1].copy()

# Position of each selected cluster in `selected_clusters`, as a string.
# setdefault keeps the first occurrence, matching list.index().
cluster_position = {}
for position, node_id in enumerate(selected_clusters):
    cluster_position.setdefault(node_id, str(position))

# Every edge gets the same fixed branch length.
len_G1 = [0.1] * len(G1)

# Cluster id per edge: position of the child among the selected clusters,
# or '-1' when the child is not a selected cluster.
cluster_id = [cluster_position.get(child, "-1") for child in G1["child"]]
print(cluster_id)

# Positions 4 and 5 append the new columns after the four original ones.
G1.insert(4, "dist_G1", len_G1)
G1.insert(5, "cluster_id", cluster_id)

# G2 keeps the full annotated table for the later node lookups.
G2 = G1.copy()
print(G2)

# Reduce G1 to (parent, child, dist_G1) rows for the parent/child table.
G1.drop(columns=["cluster_id", "lambda_val", "child_size"], inplace=True)
g1_list = G1.values.tolist()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the ete3 tree from the (parent, child, distance) rows.
tree = Tree.from_parent_child_table(g1_list)

print(G2)

# Row label of the first G2 row for each child id, built once so that each
# leaf lookup is a dict access instead of a boolean-mask scan over G2.
row_for_child = {}
for row_label, child in zip(G2.index, G2["child"]):
    row_for_child.setdefault(child, row_label)

for node in tree.traverse():
    nstyle = NodeStyle()
    if node.is_leaf():
        # Leaf names are still the numeric child ids at this point.
        row_label = row_for_child[int(node.name)]

        # Rename the leaf to its cluster id and colour it accordingly.
        node.name = G2.at[row_label, "cluster_id"]
        nstyle["fgcolor"] = str(matplotlib.colors.rgb2hex(cmap(norm(int(node.name)))))

        # Marker size is half the number of points in the child.
        nstyle["size"] = G2.at[row_label, "child_size"] / 2
    else:
        nstyle["fgcolor"] = "black"
    node.set_style(nstyle)

# Save the relabelled tree in Newick format.
tree.write(format=1, outfile="new_tree.nw")
|
|
|
|
|
|
|
|
# Circular layout spanning the full circle, with leaf names drawn.
ts = TreeStyle()
ts.mode = "c"
ts.arc_start = -180
ts.arc_span = 360
ts.scale = 40
ts.show_leaf_name = True

# Open the tree viewer with the style above.
tree.show(tree_style=ts)
|
|
|
|
|
|
|
|
|