In [1]:
import nltk
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
In [2]:
complaints = pd.read_csv("Data/complaint1700.csv")
noncomplaints = pd.read_csv("Data/noncomplaint1700.csv")
In [3]:
complaints['category']= "complaints"
In [4]:
noncomplaints['category']= "noncomplaints"
In [5]:
df = pd.concat([complaints,noncomplaints],axis=0,ignore_index=True)
In [207]:
df.head()
Out[207]:
id airline tweet category
0 80938 United @united I'm having issues. Yesterday I rebooke... complaints
1 10959 United @united kinda feel like the $6.99 you charge f... complaints
2 130813 SouthWest Livid in Vegas, delayed, again& again&... complaints
3 146589 United @united the most annoying man on earth is on m... complaints
4 117579 United @united The last 2 weeks I've flown wit u, you... complaints
In [6]:
X = df['tweet']
y = df['category']
In [7]:
#without tdif
In [8]:
cv = CountVectorizer()
In [9]:
X = cv.fit_transform(X)
In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)

Naive bayes without tfidf

In [11]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
In [12]:
nb.fit(X_train,y_train)
Out[12]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [94]:
predictions = nb.predict(X_test)
In [48]:
from sklearn.metrics import classification_report, confusion_matrix
In [49]:
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))#BEST MODEL
[[409 113]
 [167 331]]


               precision    recall  f1-score   support

   complaints       0.71      0.78      0.74       522
noncomplaints       0.75      0.66      0.70       498

     accuracy                           0.73      1020
    macro avg       0.73      0.72      0.72      1020
 weighted avg       0.73      0.73      0.72      1020

In [160]:
from sklearn.naive_bayes import ComplementNB
In [161]:
cnb = ComplementNB()
In [170]:
cnb.fit(X_train,y_train)
Out[170]:
ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
In [171]:
predictions = nb.predict(X_test)
In [172]:
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))
[[409 113]
 [167 331]]


               precision    recall  f1-score   support

   complaints       0.71      0.78      0.74       522
noncomplaints       0.75      0.66      0.70       498

     accuracy                           0.73      1020
    macro avg       0.73      0.72      0.72      1020
 weighted avg       0.73      0.73      0.72      1020

In [ ]:
 

Random Forest without tfidf

In [123]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
In [127]:
rf = RandomForestClassifier()
In [33]:
rf.fit(X_train,y_train)
/Users/kevweirikefe/opt/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Out[33]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
In [34]:
rf_predictions = rf.predict(X_test)
In [35]:
print(confusion_matrix(y_test,rf_predictions))
print('\n')
print(classification_report(y_test,rf_predictions))
[[387 135]
 [180 318]]


               precision    recall  f1-score   support

   complaints       0.68      0.74      0.71       522
noncomplaints       0.70      0.64      0.67       498

     accuracy                           0.69      1020
    macro avg       0.69      0.69      0.69      1020
 weighted avg       0.69      0.69      0.69      1020

In [130]:
param_grid = {'n_estimators':[5,10,20,30],
               'max_depth':[1,3,5,7,9,11,15]}
scores = ['precision']
In [133]:
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='%s_macro' % scores[0])
grid_search.fit(X_train,y_train)
print("Best Hyperparameter is:")
print(grid_search.best_params_)
Best Hyperparameter is:
{'max_depth': 15, 'n_estimators': 30}
In [135]:
print("Results are:")
means = grid_search.cv_results_['mean_test_score']
for mean,params in zip(means, grid_search.cv_results_['params']):
    print("%0.3f for %r" % (mean, params))
Results are:
0.624 for {'max_depth': 1, 'n_estimators': 5}
0.612 for {'max_depth': 1, 'n_estimators': 10}
0.635 for {'max_depth': 1, 'n_estimators': 20}
0.640 for {'max_depth': 1, 'n_estimators': 30}
0.593 for {'max_depth': 3, 'n_estimators': 5}
0.635 for {'max_depth': 3, 'n_estimators': 10}
0.662 for {'max_depth': 3, 'n_estimators': 20}
0.670 for {'max_depth': 3, 'n_estimators': 30}
0.621 for {'max_depth': 5, 'n_estimators': 5}
0.637 for {'max_depth': 5, 'n_estimators': 10}
0.678 for {'max_depth': 5, 'n_estimators': 20}
0.691 for {'max_depth': 5, 'n_estimators': 30}
0.638 for {'max_depth': 7, 'n_estimators': 5}
0.660 for {'max_depth': 7, 'n_estimators': 10}
0.684 for {'max_depth': 7, 'n_estimators': 20}
0.693 for {'max_depth': 7, 'n_estimators': 30}
0.643 for {'max_depth': 9, 'n_estimators': 5}
0.666 for {'max_depth': 9, 'n_estimators': 10}
0.696 for {'max_depth': 9, 'n_estimators': 20}
0.703 for {'max_depth': 9, 'n_estimators': 30}
0.642 for {'max_depth': 11, 'n_estimators': 5}
0.666 for {'max_depth': 11, 'n_estimators': 10}
0.688 for {'max_depth': 11, 'n_estimators': 20}
0.706 for {'max_depth': 11, 'n_estimators': 30}
0.642 for {'max_depth': 15, 'n_estimators': 5}
0.680 for {'max_depth': 15, 'n_estimators': 10}
0.710 for {'max_depth': 15, 'n_estimators': 20}
0.720 for {'max_depth': 15, 'n_estimators': 30}

SVM with linear kernel

In [107]:
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")


tuned_parameters = [
  {'kernel': ['linear'], 'C': [0.01,0.1,1.0,5.0,10.0,100.0]}]
  
scores = ['precision']

clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % scores[0])
clf.fit(X_train, y_train)

print("Best Hyperparameter is:")
print(clf.best_params_)

    
Best Hyperparameter is:
{'C': 0.1, 'kernel': 'linear'}
In [108]:
print("Result is:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" % (mean, params))
Result is:
0.723 for {'C': 0.01, 'kernel': 'linear'}
0.745 for {'C': 0.1, 'kernel': 'linear'}
0.727 for {'C': 1.0, 'kernel': 'linear'}
0.718 for {'C': 5.0, 'kernel': 'linear'}
0.718 for {'C': 10.0, 'kernel': 'linear'}
0.718 for {'C': 100.0, 'kernel': 'linear'}

SVC with RBF kernel- best from svc

In [116]:
tuned_parameters = [
  {'kernel': ['rbf'], 'gamma': [0.0001,0.001,0.1,1,10,100],'C': [0.1,1,10,100,1000]}]
  
# Objective metrics
scores = ['precision']

clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % scores[0])
clf.fit(X_train, y_train)

print("Best Hyperparameters are:")
print(clf.best_params_)
Best Hyperparameters are:
{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
In [118]:
print("Results are:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" % (mean, params))
Results are:
0.253 for {'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.253 for {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}
0.642 for {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
0.253 for {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
0.253 for {'C': 0.1, 'gamma': 10, 'kernel': 'rbf'}
0.253 for {'C': 0.1, 'gamma': 100, 'kernel': 'rbf'}
0.253 for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.697 for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.722 for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.574 for {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.252 for {'C': 1, 'gamma': 10, 'kernel': 'rbf'}
0.252 for {'C': 1, 'gamma': 100, 'kernel': 'rbf'}
0.700 for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.728 for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.739 for {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.544 for {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
0.252 for {'C': 10, 'gamma': 10, 'kernel': 'rbf'}
0.252 for {'C': 10, 'gamma': 100, 'kernel': 'rbf'}
0.728 for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.746 for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.739 for {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
0.544 for {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
0.252 for {'C': 100, 'gamma': 10, 'kernel': 'rbf'}
0.252 for {'C': 100, 'gamma': 100, 'kernel': 'rbf'}
0.744 for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.730 for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.739 for {'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}
0.544 for {'C': 1000, 'gamma': 1, 'kernel': 'rbf'}
0.252 for {'C': 1000, 'gamma': 10, 'kernel': 'rbf'}
0.252 for {'C': 1000, 'gamma': 100, 'kernel': 'rbf'}

SVC with Polynomial kernel

In [114]:
tuned_parameters = [
  {'kernel': ['poly'], 'degree':[0,1,2,5], 'C': [0.01,0.1,1.0,5.0,10.0,100.0]}]
  
# Objective metrics
scores = ['precision']

clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % scores[0])
clf.fit(X_train, y_train)

print("Best Hyperparameter is:")
print(clf.best_params_)
Best Hyperparameter is:
{'C': 100.0, 'degree': 1, 'kernel': 'poly'}
In [115]:
print("Results are:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" % (mean, params))
Results are:
0.253 for {'C': 0.01, 'degree': 0, 'kernel': 'poly'}
0.253 for {'C': 0.01, 'degree': 1, 'kernel': 'poly'}
0.253 for {'C': 0.01, 'degree': 2, 'kernel': 'poly'}
0.253 for {'C': 0.01, 'degree': 5, 'kernel': 'poly'}
0.253 for {'C': 0.1, 'degree': 0, 'kernel': 'poly'}
0.253 for {'C': 0.1, 'degree': 1, 'kernel': 'poly'}
0.253 for {'C': 0.1, 'degree': 2, 'kernel': 'poly'}
0.253 for {'C': 0.1, 'degree': 5, 'kernel': 'poly'}
0.253 for {'C': 1.0, 'degree': 0, 'kernel': 'poly'}
0.253 for {'C': 1.0, 'degree': 1, 'kernel': 'poly'}
0.253 for {'C': 1.0, 'degree': 2, 'kernel': 'poly'}
0.253 for {'C': 1.0, 'degree': 5, 'kernel': 'poly'}
0.253 for {'C': 5.0, 'degree': 0, 'kernel': 'poly'}
0.253 for {'C': 5.0, 'degree': 1, 'kernel': 'poly'}
0.253 for {'C': 5.0, 'degree': 2, 'kernel': 'poly'}
0.253 for {'C': 5.0, 'degree': 5, 'kernel': 'poly'}
0.253 for {'C': 10.0, 'degree': 0, 'kernel': 'poly'}
0.704 for {'C': 10.0, 'degree': 1, 'kernel': 'poly'}
0.253 for {'C': 10.0, 'degree': 2, 'kernel': 'poly'}
0.253 for {'C': 10.0, 'degree': 5, 'kernel': 'poly'}
0.253 for {'C': 100.0, 'degree': 0, 'kernel': 'poly'}
0.727 for {'C': 100.0, 'degree': 1, 'kernel': 'poly'}
0.253 for {'C': 100.0, 'degree': 2, 'kernel': 'poly'}
0.253 for {'C': 100.0, 'degree': 5, 'kernel': 'poly'}
In [119]:
svc = SVC(kernel='rbf',C=100, gamma=0.001)
In [120]:
svc.fit(X_train, y_train)
Out[120]:
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
In [121]:
svc_predictions = svc.predict(X_test)
In [122]:
print(confusion_matrix(y_test,svc_predictions))
print('\n')
print(classification_report(y_test,svc_predictions))
[[358 164]
 [115 383]]


               precision    recall  f1-score   support

   complaints       0.76      0.69      0.72       522
noncomplaints       0.70      0.77      0.73       498

     accuracy                           0.73      1020
    macro avg       0.73      0.73      0.73      1020
 weighted avg       0.73      0.73      0.73      1020

All models With tfidf

In [213]:
def tokenizer(message):
    noPunct = [char for char in message if char not in string.punctuation]
    noPunct = "".join(noPunct)
    return [word for word in noPunct.split() if word.lower not in stopwords.words("english")]
    
In [214]:
from sklearn.pipeline import Pipeline
In [242]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenizer)),
    ('tfidf', TfidfTransformer() ),
    ('classifier', MultinomialNB())
])#naive bayes
In [243]:
X = df['tweet']
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)
In [244]:
pipeline.fit(X_train, y_train)
Out[244]:
Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function tokenizer at 0x1a227233b0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)
In [180]:
tfidf_predictions = pipeline.predict(X_test)
In [142]:
print(confusion_matrix(y_test,tfidf_predictions))
print('\n')
print(classification_report(y_test,tfidf_predictions))
[[427  95]
 [194 304]]


               precision    recall  f1-score   support

   complaints       0.69      0.82      0.75       522
noncomplaints       0.76      0.61      0.68       498

     accuracy                           0.72      1020
    macro avg       0.72      0.71      0.71      1020
 weighted avg       0.72      0.72      0.71      1020

In [143]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenizer)),
    ('tfidf', TfidfTransformer() ),
    ('classifier', RandomForestClassifier(max_depth = 15, n_estimators = 30))
])#random forest
In [145]:
pipeline.fit(X_train, y_train)
Out[145]:
Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function tokenizer at 0x1a1eb17170>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tok...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=15,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=30, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)
In [146]:
tfidf_predictions = pipeline.predict(X_test)
In [147]:
print(confusion_matrix(y_test,tfidf_predictions))
print('\n')
print(classification_report(y_test,tfidf_predictions))
[[323 199]
 [137 361]]


               precision    recall  f1-score   support

   complaints       0.70      0.62      0.66       522
noncomplaints       0.64      0.72      0.68       498

     accuracy                           0.67      1020
    macro avg       0.67      0.67      0.67      1020
 weighted avg       0.67      0.67      0.67      1020

In [149]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=tokenizer)),
    ('tfidf', TfidfTransformer() ),
    ('classifier', SVC(kernel='rbf',C=100, gamma=0.001))
])#svc
In [150]:
pipeline.fit(X_train, y_train)
Out[150]:
Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function tokenizer at 0x1a1eb17170>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=0.001,
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)
In [151]:
tfidf_predictions = pipeline.predict(X_test)
In [152]:
print(confusion_matrix(y_test,tfidf_predictions))
print('\n')
print(classification_report(y_test,tfidf_predictions))
[[339 183]
 [114 384]]


               precision    recall  f1-score   support

   complaints       0.75      0.65      0.70       522
noncomplaints       0.68      0.77      0.72       498

     accuracy                           0.71      1020
    macro avg       0.71      0.71      0.71      1020
 weighted avg       0.71      0.71      0.71      1020

In [96]:
predictions[0]
Out[96]:
'complaints'

Final model on test data

In [13]:
import numpy as np
from pandas import DataFrame
In [14]:
myData = pd.read_csv("Data/Goke2.csv")
In [15]:
myData.head()
Out[15]:
id tid_not_to_be_used airline tag tweet
0 20 5.062408e+17 JetBlue iX&x6h27>Zlk @JetBlue two gate changes and delays! I'm so a...
1 64 5.062617e+17 United iX&x6h27>Zlk Worst Txt ever. I hate you @united http://t.co...
2 66 5.062627e+17 SouthWest iX&x6h27>Zlk @SouthwestAir flight 1222 PHX to STL LATE AGAI...
3 69 5.062650e+17 SouthWest iX&x6h27>Zlk Fuck you @SouthwestAir
4 75 5.062711e+17 United iX&x6h27>Zlk @Dean_Bottorff I'll tell you the long story wh...
In [16]:
tweet = myData['tweet']
In [17]:
tweet = cv.transform(tweet)
In [18]:
all_predictions = nb.predict(tweet)
In [19]:
all_predictions.shape
Out[19]:
(4555,)
In [190]:
#log_proba.shape
In [191]:
#log_proba[0:3]
In [20]:
df_pred = DataFrame(data = all_predictions, columns = ["prediction"], index = np.arange(0,4555))
In [21]:
print(df_pred)
      prediction
0     complaints
1     complaints
2     complaints
3     complaints
4     complaints
...          ...
4550  complaints
4551  complaints
4552  complaints
4553  complaints
4554  complaints

[4555 rows x 1 columns]
In [22]:
final_df = pd.merge(myData, df_pred, left_index=True, right_index=True, how="outer")
In [23]:
final_df.head()
Out[23]:
id tid_not_to_be_used airline tag tweet prediction
0 20 5.062408e+17 JetBlue iX&x6h27>Zlk @JetBlue two gate changes and delays! I'm so a... complaints
1 64 5.062617e+17 United iX&x6h27>Zlk Worst Txt ever. I hate you @united http://t.co... complaints
2 66 5.062627e+17 SouthWest iX&x6h27>Zlk @SouthwestAir flight 1222 PHX to STL LATE AGAI... complaints
3 69 5.062650e+17 SouthWest iX&x6h27>Zlk Fuck you @SouthwestAir complaints
4 75 5.062711e+17 United iX&x6h27>Zlk @Dean_Bottorff I'll tell you the long story wh... complaints
In [24]:
non_negative = final_df[final_df['prediction'] == "noncomplaints"]
In [25]:
non_negative #too many non-negative tweets, try with svc model, still best
Out[25]:
id tid_not_to_be_used airline tag tweet prediction
5 158 5.064371e+17 AlaskaAir iX&x6h27>Zlk @kumailn @AlaskaAir I have to say I feel a bit... noncomplaints
6 186 5.064593e+17 VirginAmerica iX&x6h27>Zlk @IIJERiiCHOII @virginamerica not cool noncomplaints
7 209 5.064603e+17 VirginAmerica iX&x6h27>Zlk @IIJERiiCHOII @VirginAmerica that's fucked up noncomplaints
10 363 5.065376e+17 JetBlue iX&x6h27>Zlk On hold with @JetBlue they were playing the do... noncomplaints
12 482 5.065957e+17 United iX&x6h27>Zlk @united cancelled flight 5000 to Syracuse depa... noncomplaints
... ... ... ... ... ... ...
4525 172487 6.049289e+17 United iX&x6h27>Zlk @united why the fuck didn't you give the lady ... noncomplaints
4526 172513 6.049448e+17 United iX&x6h27>Zlk @united I have a business proposal for you. Us... noncomplaints
4534 172924 6.050417e+17 United iX&x6h27>Zlk @omerm27 @united @TaheraHAhmad I am stunned at... noncomplaints
4546 173503 6.051436e+17 United iX&x6h27>Zlk @pierreberastain @jaehornblower @united leave ... noncomplaints
4548 173510 6.051447e+17 United iX&x6h27>Zlk @united Hi, waiting on information re: UA1242.... noncomplaints

1049 rows × 6 columns

In [241]:
tweet = myData['tweet']
In [153]:
all_predictions = svc.predict(tweet)
In [154]:
df_pred = DataFrame(data = all_predictions, columns = ["prediction"], index = np.arange(0,4555))
In [155]:
print(df_pred)
         prediction
0        complaints
1        complaints
2        complaints
3        complaints
4        complaints
...             ...
4550     complaints
4551  noncomplaints
4552     complaints
4553     complaints
4554     complaints

[4555 rows x 1 columns]
In [156]:
final_df = pd.merge(myData, df_pred, left_index=True, right_index=True, how="outer")
In [157]:
non_negative = final_df[final_df['prediction'] == "noncomplaints"]
In [158]:
non_negative
Out[158]:
id airline tweet prediction
6 189 JetBlue Vacation's over. I'd be sad but I'm looking fo... noncomplaints
7 203 VirginAmerica @IIJERiiCHOII @VirginAmerica dude talk about a... noncomplaints
8 215 VirginAmerica @airfarewatchdog flew AUS-SFO on @VirginAmeric... noncomplaints
13 504 SouthWest @abbydarkstar I've never had trouble with @So... noncomplaints
19 645 United @united @sarmient01 cancel his bumass flight too noncomplaints
... ... ... ... ...
4529 172768 VirginAmerica This is @virginamerica helping me un-screw wha... noncomplaints
4539 173066 SouthWest @AnoushahKPRC @SouthwestAir You should hook up... noncomplaints
4542 173082 United this literally never happened @omarsuleiman504... noncomplaints
4544 173134 United @NewPakistan2020 @united I am sorry, It can be... noncomplaints
4551 173539 Delta Oh @DeltaAssist... U have iPads in the termina... noncomplaints

1530 rows × 4 columns

In [245]:
nbpipe_predict=pipeline.predict(myData['tweet'])
In [246]:
df_pred = DataFrame(data = all_predictions, columns = ["prediction"], index = np.arange(0,4555))
In [247]:
final_df = pd.merge(myData, df_pred, left_index=True, right_index=True, how="outer")
In [248]:
non_negative = final_df[final_df['prediction'] == "noncomplaints"]
In [249]:
non_negative
Out[249]:
id airline tweet prediction
6 189 JetBlue Vacation's over. I'd be sad but I'm looking fo... noncomplaints
7 203 VirginAmerica @IIJERiiCHOII @VirginAmerica dude talk about a... noncomplaints
8 215 VirginAmerica @airfarewatchdog flew AUS-SFO on @VirginAmeric... noncomplaints
19 645 United @united @sarmient01 cancel his bumass flight too noncomplaints
21 744 SouthWest @SouthwestAir i just flew airtran and i am ver... noncomplaints
... ... ... ... ...
4532 172863 United Cappuccino and mini scones at the @united Club... noncomplaints
4539 173066 SouthWest @AnoushahKPRC @SouthwestAir You should hook up... noncomplaints
4542 173082 United this literally never happened @omarsuleiman504... noncomplaints
4543 173101 United Islamophobia. @united you guys are disgusting ... noncomplaints
4551 173539 Delta Oh @DeltaAssist... U have iPads in the termina... noncomplaints

1026 rows × 4 columns

In [199]:
!pwd
/Users/kevweirikefe/Documents/Simon Classes/CIS 434 - Social Media Analytics
In [26]:
export_csv = non_negative.to_csv (r'/Users/kevweirikefe/Documents/Simon Classes/CIS 434 - Social Media Analytics/Goke.csv', index = None, header=True)
In [250]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
In [251]:
tweet = myData['tweet']
In [252]:
def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(score)))
In [253]:
sentiment_analyzer_scores(tweet)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-253-18b60c392be5> in <module>
----> 1 sentiment_analyzer_scores(tweet)

<ipython-input-252-8b1fa62968b2> in sentiment_analyzer_scores(sentence)
      1 def sentiment_analyzer_scores(sentence):
----> 2     score = analyser.polarity_scores(sentence)
      3     print("{:-<40} {}".format(sentence, str(score)))

~/opt/anaconda3/lib/python3.7/site-packages/vaderSentiment/vaderSentiment.py in polarity_scores(self, text)
    246         """
    247         # convert emojis to their textual descriptions
--> 248         text_token_list = text.split()
    249         text_no_emoji_lst = []
    250         for token in text_token_list:

~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'split'
In [257]:
for sentence in tweet:
    print(sentence),
    sentiment = sentiment_analyzer_scores(tweet)
    print("\n\t" + str(sentiment))
Our entire flight missed connections and @AmericanAir has 4 agents in Miami trying to rebook 200 people. Worst airline ever.
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-257-a983c84935fd> in <module>
      1 for sentence in tweet:
      2     print(sentence),
----> 3     sentiment = sentiment_analyzer_scores(tweet)
      4     print("\n\t" + str(sentiment))

<ipython-input-252-8b1fa62968b2> in sentiment_analyzer_scores(sentence)
      1 def sentiment_analyzer_scores(sentence):
----> 2     score = analyser.polarity_scores(sentence)
      3     print("{:-<40} {}".format(sentence, str(score)))

~/opt/anaconda3/lib/python3.7/site-packages/vaderSentiment/vaderSentiment.py in polarity_scores(self, text)
    246         """
    247         # convert emojis to their textual descriptions
--> 248         text_token_list = text.split()
    249         text_no_emoji_lst = []
    250         for token in text_token_list:

~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5177             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5178                 return self[name]
-> 5179             return object.__getattribute__(self, name)
   5180 
   5181     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'split'
In [ ]: