Commit f33b21f6 by Paktalin

Apply k-means clustering to the verbs dataset (not working yet)

parent 29f76cce
from estnltk import Text from estnltk import Text
from util import save_dict, load_dict, save_csv, read_csv from util import save_dict, load_dict, save_csv, read_csv
from k_means import plot_k_means
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
...@@ -71,9 +72,26 @@ def construct_df_of_verbs(initial_df): ...@@ -71,9 +72,26 @@ def construct_df_of_verbs(initial_df):
i += 1 i += 1
verbs_df = pd.DataFrame.from_dict(rows) verbs_df = pd.DataFrame.from_dict(rows)
verbs_df = verbs_df[verbs_df['number_of_samples'] != 0] verbs_df = verbs_df[verbs_df['number_of_samples'] != 0]
verbs_df = verbs_df.fillna(0)
print(verbs_df)
save_csv(verbs_df, 'verbs.csv', sep='~', header=True) save_csv(verbs_df, 'verbs.csv', sep='~', header=True)
print(verbs_df) print(verbs_df)
def transform_df_to_preprocessed_array(df):
    """Turn the verbs dataframe into a per-sample-normalised feature matrix.

    The ``verb`` and ``number_of_samples`` columns are dropped; every
    remaining column is divided, row by row, by that row's
    ``number_of_samples`` value.

    Args:
        df: DataFrame containing ``verb`` and ``number_of_samples`` columns
            plus any number of feature columns.

    Returns:
        A ``(X, columns)`` tuple: ``X`` is the normalised 2-D array of
        feature values and ``columns`` is the index of the feature column
        names, in the same order as the columns of ``X``.
    """
    features = df.drop(['verb', 'number_of_samples'], axis=1)
    columns = features.columns
    number_of_samples = np.asarray(df['number_of_samples'].values)
    # A trailing axis of length 1 lets NumPy broadcast the per-row divisor
    # across all feature columns (and also copes with zero feature columns).
    X = features.values / number_of_samples[:, np.newaxis]
    return X, columns
# Driver: load verbs.csv (the file written by construct_df_of_verbs),
# normalise it and run k-means with K clusters, plotting the result.
df = read_csv('verbs.csv', sep='~', header=0) df = read_csv('verbs.csv', sep='~', header=0)
print(df) X, columns = transform_df_to_preprocessed_array(df)
\ No newline at end of file K = 5
plot_k_means(X, K, columns)
# Disabled: rebuilds verbs.csv from cleaned_dataframe.csv via
# construct_df_of_verbs.
# df = read_csv('cleaned_dataframe.csv', sep='~')
# df.columns = ['distance', 'noun_like', 'noun_like_form', 'noun_like_pos', 'sentence', 'verb', 'verbs_form']
# construct_df_of_verbs(df)
\ No newline at end of file
import sys

import matplotlib.pyplot as plt
import numpy as np
# Always print arrays in full. NaN is rejected as a threshold by current
# NumPy releases; sys.maxsize is the documented "never summarise" value.
np.set_printoptions(threshold=sys.maxsize)
def d(u, v):
    """Return the squared Euclidean distance between ``u`` and ``v``.

    Computed as the dot product of the difference ``u - v`` with itself.
    """
    offset = u - v
    return offset.dot(offset)
def cost(X, R, M):
    """Return the responsibility-weighted sum of squared distances.

    For each row ``M[k]``, the squared distance from every row of ``X`` to
    that row is weighted by the matching entry of column ``R[:, k]``; the
    weighted distances are summed over all rows of ``X`` and all ``k``.
    """
    total = 0
    for k, center in enumerate(M):
        offsets = X - center
        total += (R[:, k] * (offsets * offsets).sum(axis=1)).sum()
    return total
def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True):
    """Run soft k-means on ``X`` and optionally plot the outcome.

    Args:
        X: 2-D array of samples, one row per sample and one column per
            feature.
        K: number of clusters.
        columns: feature names, indexable by column position of ``X``;
            used as axis labels in the scatter plots.
        max_iter: number of update iterations to run.
        beta: scale applied to the squared distances before the
            exponential; larger values give harder assignments.
        show_plots: when true, plot the cost per iteration followed by one
            scatter plot for every pair of feature columns.

    Returns:
        ``(M, R)`` where ``M`` is the ``(K, D)`` array of cluster means and
        ``R`` is the ``(N, K)`` array of responsibilities (each row sums
        to 1).
    """
    N, D = X.shape
    M = np.zeros((K, D))  # cluster means
    sq_dists = np.empty((N, K))

    # Seed every mean with a randomly chosen sample.
    for k in range(K):
        M[k] = X[np.random.choice(N)]

    costs = np.zeros(max_iter)
    for i in range(max_iter):
        # Step 1: responsibilities are a softmax over -beta * distance.
        for k in range(K):
            for n in range(N):
                sq_dists[n, k] = d(M[k], X[n])
        logits = -beta * sq_dists
        # Shifting each row by its maximum leaves the softmax unchanged but
        # guarantees one exponent per row equals 1, so the row sum can
        # never underflow to 0 and produce NaN responsibilities.
        logits -= logits.max(axis=1, keepdims=True)
        exponents = np.exp(logits)
        R = exponents / exponents.sum(axis=1, keepdims=True)

        # Step 2: move every mean to the weighted average of the samples.
        for k in range(K):
            weight = R[:, k].sum()
            # A cluster that no sample is responsible for keeps its mean
            # instead of becoming NaN through a division by zero.
            if weight > 0:
                M[k] = R[:, k].dot(X) / weight

        costs[i] = cost(X, R, M)
        # Early stopping on convergence is currently disabled:
        # if i > 0:
        #     if np.abs(costs[i] - costs[i-1]) < 1e-5:
        #         break

    if show_plots:
        plt.plot(costs)
        plt.title("Costs")
        plt.show()

        # Blend one random colour per cluster by each sample's
        # responsibilities.
        random_colors = np.random.random((K, 3))
        colors = R.dot(random_colors)
        # One scatter plot per unordered pair of *feature* columns; the
        # bounds are D (features), not N (samples).
        for i in range(D):
            for j in range(i + 1, D):
                plt.scatter(X[:, i], X[:, j], c=colors)
                plt.xlabel(columns[i])
                plt.ylabel(columns[j])
                plt.show()

    return M, R
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment