Commit 60a137b8 by Paktalin

finished sentiment analysis exercise

parent d4b4a345
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup


def my_tokenizer(s):
    # lowercase, tokenize, drop very short tokens, lemmatize, remove stopwords
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stop_words]
    return tokens


def tokens_to_vector(tokens, label):
    # normalized bag-of-words vector; the last component stores the label
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()
    x[-1] = label
    return x


wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(w.rstrip() for w in open('stop_words.txt'))

positive_reviews = BeautifulSoup(open('files/sorted_data_acl/electronics/positive.review').read(), 'lxml')
positive_reviews = positive_reviews.findAll('review_text')
negative_reviews = BeautifulSoup(open('files/sorted_data_acl/electronics/negative.review').read(), 'lxml')
negative_reviews = negative_reviews.findAll('review_text')

# balance the classes: keep only as many positive reviews as there are negative ones
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

# build the vocabulary, mapping each token to a column index
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

# assemble the (N x D+1) data matrix; the last column holds the label
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    data[i, :] = tokens_to_vector(tokens, 1)
    i += 1
for tokens in negative_tokenized:
    data[i, :] = tokens_to_vector(tokens, 0)
    i += 1

# shuffle, then hold out the last 100 rows as a test set
np.random.shuffle(data)
X = data[:, :-1]
Y = data[:, -1]
Xtrain = X[:-100]
Ytrain = Y[:-100]
Xtest = X[-100:]
Ytest = Y[-100:]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print('Classification rate for train data:', model.score(Xtrain, Ytrain))
print('Classification rate for test data:', model.score(Xtest, Ytest))

# print the words whose learned weights are far from zero
threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)
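Note: word_tokenize and WordNetLemmatizer above rely on NLTK data packages that ship separately from the library itself. If they are not already installed, a one-time download along these lines is needed before the script will run:

import nltk
nltk.download('punkt')    # tokenizer models used by nltk.tokenize.word_tokenize
nltk.download('wordnet')  # WordNet corpus used by WordNetLemmatizer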
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud


def train_test_split(X, Y, test_size):
    # hold out the last test_size fraction of the rows as a test set
    test_size = int(X.shape[0] * test_size)
    Xtrain = X[:-test_size]
    Xtest = X[-test_size:]
    Ytrain = Y[:-test_size]
    Ytest = Y[-test_size:]
    return Xtrain, Xtest, Ytrain, Ytest


def visualize(label):
    # word cloud of all messages carrying the given label
    words = ''
    for msg in df[df['labels'] == label]['data']:
        msg = msg.lower()
        words += msg + ' '
    word_cloud = WordCloud(width=600, height=400).generate(words)
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()


df = pd.read_csv('./files/sms_spam.csv', encoding='ISO-8859-1')
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.columns = ['labels', 'data']
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
Y = df['b_labels'].values

count_vectorizer = CountVectorizer(decode_error='ignore')
X = count_vectorizer.fit_transform(df['data'])
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33)

model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("Train score is", model.score(Xtrain, Ytrain))
print("Test score is", model.score(Xtest, Ytest))

# visualize('spam')
# visualize('ham')

# inspect the misclassified messages in both directions
df['predictions'] = model.predict(X)
sneaky_spam = df[(df['predictions'] == 0) & (df['b_labels'] == 1)]['data']
for msg in sneaky_spam:
    print(msg)
print('\n\n')
not_actually_spam = df[(df['predictions'] == 1) & (df['b_labels'] == 0)]['data']
for msg in not_actually_spam:
    print(msg)
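TfidfVectorizer is imported above but never used. A natural extension of the exercise is to swap it in for CountVectorizer and compare scores; a minimal sketch, reusing df, Y, and train_test_split from the script above (the *_tfidf names are illustrative additions, not part of the original):

# Same Naive Bayes pipeline, but with TF-IDF features instead of raw counts.
tfidf_vectorizer = TfidfVectorizer(decode_error='ignore')
X_tfidf = tfidf_vectorizer.fit_transform(df['data'])
Xtrain_tfidf, Xtest_tfidf, Ytrain_tfidf, Ytest_tfidf = train_test_split(X_tfidf, Y, test_size=0.33)
model_tfidf = MultinomialNB()
model_tfidf.fit(Xtrain_tfidf, Ytrain_tfidf)
print("TF-IDF train score is", model_tfidf.score(Xtrain_tfidf, Ytrain_tfidf))
print("TF-IDF test score is", model_tfidf.score(Xtest_tfidf, Ytest_tfidf))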
a
about
above
across
after
again
against
all
almost
alone
along
already
also
although
always
among
an
and
another
any
anybody
anyone
anything
anywhere
are
area
areas
around
as
ask
asked
asking
asks
at
away
b
back
backed
backing
backs
be
became
because
become
becomes
been
before
began
behind
being
beings
best
better
between
big
both
but
by
c
came
can
cannot
case
cases
certain
certainly
clear
clearly
come
could
d
did
differ
different
differently
do
does
done
down
down
downed
downing
downs
during
e
each
early
either
end
ended
ending
ends
enough
even
evenly
ever
every
everybody
everyone
everything
everywhere
f
face
faces
fact
facts
far
felt
few
find
finds
first
for
four
from
full
fully
further
furthered
furthering
furthers
g
gave
general
generally
get
gets
give
given
gives
go
going
good
goods
got
great
greater
greatest
group
grouped
grouping
groups
h
had
has
have
having
he
her
here
herself
high
high
high
higher
highest
him
himself
his
how
however
i
if
important
in
interest
interested
interesting
interests
into
is
it
its
itself
j
just
k
keep
keeps
kind
knew
know
known
knows
l
large
largely
last
later
latest
least
less
let
lets
like
likely
long
longer
longest
m
made
make
making
man
many
may
me
member
members
men
might
more
most
mostly
mr
mrs
much
must
my
myself
n
necessary
need
needed
needing
needs
never
new
new
newer
newest
next
no
nobody
non
noone
not
nothing
now
nowhere
number
numbers
o
of
off
often
old
older
oldest
on
once
one
only
open
opened
opening
opens
or
order
ordered
ordering
orders
other
others
our
out
over
p
part
parted
parting
parts
per
perhaps
place
places
point
pointed
pointing
points
possible
present
presented
presenting
presents
problem
problems
put
puts
q
quite
r
rather
really
right
right
room
rooms
s
said
same
saw
say
says
second
seconds
see
seem
seemed
seeming
seems
sees
several
shall
she
should
show
showed
showing
shows
side
sides
since
small
smaller
smallest
so
some
somebody
someone
something
somewhere
state
states
still
still
such
sure
t
take
taken
than
that
the
their
them
then
there
therefore
these
they
thing
things
think
thinks
this
those
though
thought
thoughts
three
through
thus
to
today
together
too
took
toward
turn
turned
turning
turns
two
u
under
until
up
upon
us
use
used
uses
v
very
w
want
wanted
wanting
wants
was
way
ways
we
well
wells
went
were
what
when
where
whether
which
while
who
whole
whose
why
will
with
within
without
work
worked
working
works
would
x
y
year
years
yet
you
young
younger
youngest
your
yours
z