Removed redundant features

b0ad7153 · Paktalin · 3c6d5faf · b0ad7153 · b0ad7153 · b0ad7153
Commit b0ad7153 authored Dec 16, 2018 by Paktalin
Showing 4 changed files with 31 additions and 27 deletions
__pycache__/k_means.cpython-36.pyc
estnltk_preprocessing.py
k_means.py
verbs.csv
--- a/__pycache__/k_means.cpython-36.pyc
+++ b/__pycache__/k_means.cpython-36.pyc
--- a/estnltk_preprocessing.py
+++ b/estnltk_preprocessing.py
@@ -52,10 +52,15 @@ def clean_dataframe(df):
 	save_csv(df, 'cleaned_dataframe.csv', sep='~')
 	print(df[df['distance'] > 100]['sentence'])

-def add_value_to_dict(value, dictionary, distance):
-	if not value in dictionary:
-		dictionary[value] = 0
-	dictionary[value] += 1 / distance
+def add_value_to_dict(key, dictionary, distance):
+	if type(key) is str:
+		if 'sg ' in key:
+			key = key.replace('sg ', 'sg/pl ')
+		elif 'pl ' in key:
+			key = key.replace('pl ', 'sg/pl ')
+		if not key in dictionary:
+			dictionary[key] = 0
+		dictionary[key] += 1 / distance


 def construct_df_of_verbs(initial_df):
@@ -80,8 +85,7 @@ def construct_df_of_verbs(initial_df):
 	print(verbs_df)

 def transform_df_to_preprocessed_array(df): # divide by the number of samples
-	X = df.drop(['verb', 'number_of_samples'], axis=1)
-	remove_unpopular_features(X)
+	X = drop_redundant_features(df)
 	columns = X.columns
 	X = X.values
 	number_of_samples = df['number_of_samples'].values
@@ -90,28 +94,28 @@ def transform_df_to_preprocessed_array(df): # divide by the number of samples
 	X = X / number_of_samples
 	return X, columns

-def remove_unpopular_features(df):
-	df = df.drop(['b|vad', 'gu', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat'], axis=1)
-	# print(df[df['ksite'] != 0]['ksite'])
-	# print(df[df['neg ge'] != 0]['neg ge'])
-	# print(df[df['nud'] != 0]['nud'])
-	print(df[df['nuks'] != 0]['nuks'])
-	# print(df['nuksin'])
-	# print(df['tav'])
-	# print(df['tud'])
-	# print(df['v'])
-	# print(df['Unnamed: 84'])
-	# print(df['neg gem'])
-	# print(df['n|sin'])
-	# print(df['tavat|vat'])
-	# print(df['tama'])
-	# print(df['me|sime'])
-	# print(df['tav|v'])
+def drop_redundant_features(df):
+	df = df.drop(['verb', 'number_of_samples'], axis=1) 
+	df = drop_verb_forms(df)
+	df = drop_parts_of_speech(df)
+	# remove rare features
+	# df = df.drop(['b|vad', 'gu', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat', 'nuksin', 'tav', 'tud', 'neg gem', 'n|sin', 'tavat|vat', 'tama', 'me|sime', 'tav|v', 'ksite', 'neg ge', 'nud', 'nuks', 'v'], axis=1)
+	print(df.columns)
+	return df
+
+def drop_verb_forms(df):
+	df = df.drop(['b', 'd', 'da', 'des', 'ks', 'ksid', 'ma', 'me', 's', 'sid', 'ta', 'vad', 'b|vad', 'ge', 'gem', 'gu', 'ksime', 'ksin', 'ksite', 'maks', 'mas', 'mast', 'mata', 'n', 'neg ge', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat', 'nud', 'nuks', 'nuksin', 'o', 'sime', 'sin', 'site', 'taks', 'takse', 'tav', 'te', 'ti', 'tud', 'v', 'vat', 'neg gem', 'n|sin', 'ma|tama', 'tavat|vat', 'tama', 'me|sime', 'tav|v'], axis=1)
+	return df
+
+def drop_parts_of_speech(df):
+	df = df.drop(['A', 'H', 'N', 'O', 'P', 'S', 'U', 'Y'], axis=1)
+	return df
+

 df = read_csv('verbs.csv', sep='~', header=0)
 X, columns = transform_df_to_preprocessed_array(df)
-# K = 5
-# plot_k_means(X, K, columns)
+K = 5
+plot_k_means(X, K, columns)


 # df = read_csv('cleaned_dataframe.csv', sep='~')

--- a/k_means.py
+++ b/k_means.py
@@ -50,8 +50,8 @@ def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True):

 		random_colors = np.random.random((K, 3))
 		colors = R.dot(random_colors)
-		for i in range(X.shape[0]-1):
-			for j in range(i + 1, X.shape[0]-1):
+		for i in range(X.shape[1]-1):
+			for j in range(i + 1, X.shape[1]-1):
 				plt.scatter(X[:,i], X[:,j], c=colors, s=7, alpha=0.9)
 				plt.xlabel(columns[i])
 				plt.ylabel(columns[j])

--- a/verbs.csv
+++ b/verbs.csv