Commit b0ad7153 by Paktalin

Removed redundant features

parent 3c6d5faf
...@@ -52,10 +52,15 @@ def clean_dataframe(df): ...@@ -52,10 +52,15 @@ def clean_dataframe(df):
save_csv(df, 'cleaned_dataframe.csv', sep='~') save_csv(df, 'cleaned_dataframe.csv', sep='~')
print(df[df['distance'] > 100]['sentence']) print(df[df['distance'] > 100]['sentence'])
def add_value_to_dict(value, dictionary, distance): def add_value_to_dict(key, dictionary, distance):
if not value in dictionary: if type(key) is str:
dictionary[value] = 0 if 'sg ' in key:
dictionary[value] += 1 / distance key = key.replace('sg ', 'sg/pl ')
elif 'pl ' in key:
key = key.replace('pl ', 'sg/pl ')
if not key in dictionary:
dictionary[key] = 0
dictionary[key] += 1 / distance
def construct_df_of_verbs(initial_df): def construct_df_of_verbs(initial_df):
...@@ -80,8 +85,7 @@ def construct_df_of_verbs(initial_df): ...@@ -80,8 +85,7 @@ def construct_df_of_verbs(initial_df):
print(verbs_df) print(verbs_df)
def transform_df_to_preprocessed_array(df): # divide by the number of samples def transform_df_to_preprocessed_array(df): # divide by the number of samples
X = df.drop(['verb', 'number_of_samples'], axis=1) X = drop_redundant_features(df)
remove_unpopular_features(X)
columns = X.columns columns = X.columns
X = X.values X = X.values
number_of_samples = df['number_of_samples'].values number_of_samples = df['number_of_samples'].values
...@@ -90,28 +94,28 @@ def transform_df_to_preprocessed_array(df): # divide by the number of samples ...@@ -90,28 +94,28 @@ def transform_df_to_preprocessed_array(df): # divide by the number of samples
X = X / number_of_samples X = X / number_of_samples
return X, columns return X, columns
def remove_unpopular_features(df): def drop_redundant_features(df):
df = df.drop(['b|vad', 'gu', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat'], axis=1) df = df.drop(['verb', 'number_of_samples'], axis=1)
# print(df[df['ksite'] != 0]['ksite']) df = drop_verb_forms(df)
# print(df[df['neg ge'] != 0]['neg ge']) df = drop_parts_of_speech(df)
# print(df[df['nud'] != 0]['nud']) # remove rare features
print(df[df['nuks'] != 0]['nuks']) # df = df.drop(['b|vad', 'gu', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat', 'nuksin', 'tav', 'tud', 'neg gem', 'n|sin', 'tavat|vat', 'tama', 'me|sime', 'tav|v', 'ksite', 'neg ge', 'nud', 'nuks', 'v'], axis=1)
# print(df['nuksin']) print(df.columns)
# print(df['tav']) return df
# print(df['tud'])
# print(df['v']) def drop_verb_forms(df):
# print(df['Unnamed: 84']) df = df.drop(['b', 'd', 'da', 'des', 'ks', 'ksid', 'ma', 'me', 's', 'sid', 'ta', 'vad', 'b|vad', 'ge', 'gem', 'gu', 'ksime', 'ksin', 'ksite', 'maks', 'mas', 'mast', 'mata', 'n', 'neg ge', 'neg ks', 'neg me', 'neg nud', 'neg o', 'neg vat', 'nud', 'nuks', 'nuksin', 'o', 'sime', 'sin', 'site', 'taks', 'takse', 'tav', 'te', 'ti', 'tud', 'v', 'vat', 'neg gem', 'n|sin', 'ma|tama', 'tavat|vat', 'tama', 'me|sime', 'tav|v'], axis=1)
# print(df['neg gem']) return df
# print(df['n|sin'])
# print(df['tavat|vat']) def drop_parts_of_speech(df):
# print(df['tama']) df = df.drop(['A', 'H', 'N', 'O', 'P', 'S', 'U', 'Y'], axis=1)
# print(df['me|sime']) return df
# print(df['tav|v'])
df = read_csv('verbs.csv', sep='~', header=0) df = read_csv('verbs.csv', sep='~', header=0)
X, columns = transform_df_to_preprocessed_array(df) X, columns = transform_df_to_preprocessed_array(df)
# K = 5 K = 5
# plot_k_means(X, K, columns) plot_k_means(X, K, columns)
# df = read_csv('cleaned_dataframe.csv', sep='~') # df = read_csv('cleaned_dataframe.csv', sep='~')
......
...@@ -50,8 +50,8 @@ def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True): ...@@ -50,8 +50,8 @@ def plot_k_means(X, K, columns, max_iter=20, beta=1.0, show_plots=True):
random_colors = np.random.random((K, 3)) random_colors = np.random.random((K, 3))
colors = R.dot(random_colors) colors = R.dot(random_colors)
for i in range(X.shape[0]-1): for i in range(X.shape[1]-1):
for j in range(i + 1, X.shape[0]-1): for j in range(i + 1, X.shape[1]-1):
plt.scatter(X[:,i], X[:,j], c=colors, s=7, alpha=0.9) plt.scatter(X[:,i], X[:,j], c=colors, s=7, alpha=0.9)
plt.xlabel(columns[i]) plt.xlabel(columns[i])
plt.ylabel(columns[j]) plt.ylabel(columns[j])
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment