In [5]:
!kaggle competitions download -c nlp-getting-started

nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
!ls

NLP_disaster_tweet-Copy1.ipynb nlp-getting-started.zip
NLP_disaster_tweet.ipynb       submission.csv


In [7]:
import pandas as pd
from zipfile import ZipFile

data_file = 'nlp-getting-started.zip'

with ZipFile(data_file, 'r') as zip:
    df_train = pd.read_csv(zip.open('train.csv', 'r'))
    df_test = pd.read_csv(zip.open('test.csv', 'r'))
    
print(f'train: {len(df_train)}, test: {len(df_test)}')
df_train.head(5)

train: 7613, test: 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
df_train.target.unique()

array([1, 0])

In [9]:
df_train[df_train.target == 0].text.values[0]

"What's up man?"

In [10]:
df_train[df_train.target == 1].text.values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

### Pre-processing

In [11]:
import re
import string

def process_text(data):
    tx = data.apply(lambda x: re.sub("http\S+", '', str(x)))
    tx = tx.apply(lambda x: re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '',x))
    tx = tx.apply(lambda x: re.sub(' +', ' ', x))
    tx = tx.apply(lambda x: re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', x))
    tx = tx.apply(lambda x: re.sub('(@[A-Za-z]+[A-za-z0-9-_]+)', '', x))
    tx = tx.apply(lambda x: re.sub('rt', '', x))
    tx = tx.apply(lambda x: ''.join([i for i in x if i not in string.punctuation]))
    return tx

In [12]:
df_train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [13]:
df_train.text = process_text(df_train.text)
df_test.text = process_text(df_test.text)
df_train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this eahquake May ...,1
1,4,,,Forest fire near La Ronge Sask Canada,1
2,5,,,All residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,Just got sent this photo from Ruby Alaska as s...,1


### Baseline: Ridge Classifier

In [14]:
import numpy as np
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [15]:
count_vectorizer = feature_extraction.text.CountVectorizer()

train_vectors = count_vectorizer.fit_transform(df_train.text)
train_vectors.shape

(7613, 17956)

In [16]:
test_vectors = count_vectorizer.transform(df_test.text)

In [17]:
clf = linear_model.RidgeClassifier()

In [18]:
scores = model_selection.cross_val_score(clf, train_vectors, df_train.target,
                                         cv=3, scoring='f1')
scores

array([0.6123302 , 0.55121227, 0.61226508])

### RNN

In [19]:
import tensorflow as tf

In [63]:
VOCAB_SIZE = 5000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(df_train.text.values)

In [64]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is', 'for',
       'on', 'you', 'my', 'with', 'it', 'that', 'at', 'by', 'this'],
      dtype='<U50')

In [65]:
target = df_train.target.values.reshape(-1, 1)

In [84]:
model = tf.keras.Sequential()
model.add(encoder)
model.add(tf.keras.layers.Embedding(len(encoder.get_vocabulary()),
                                    64,
                                    mask_zero=True))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True,
                                                             dropout=0.3, recurrent_dropout=0.3)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, dropout=0.3, recurrent_dropout=0.3)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [85]:
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.tp = self.add_weight(name='tp',
                                  initializer='zeros')
        self.fp = self.add_weight(name='fp',
                                  initializer='zeros')
        self.fn = self.add_weight(name='fn',
                                  initializer='zeros')
    
    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, tf.bool)
        #cond = tf.greater(y_pred, tf.broadcast_to(0.5, y_pred.shape))
        #y_pred = tf.where(cond,
        #         tf.broadcast_to(True, y_pred.shape),
        #         tf.broadcast_to(False, y_pred.shape))
        y_pred = tf.cast(y_pred * 2, tf.int32)
        y_pred = tf.cast(y_pred, tf.bool)
        
        tp = tf.logical_and(tf.equal(y_pred, True),
                            tf.equal(y_true, True))
        fp = tf.logical_and(tf.equal(y_pred, True),
                            tf.equal(y_true, False))
        fn = tf.logical_and(tf.equal(y_pred, False),
                            tf.equal(y_true, True))
        tp = tf.cast(tp, self.dtype)
        fp = tf.cast(fp, self.dtype)
        fn = tf.cast(fn, self.dtype)
        
        #if sample_weight is not None:
        #    sample_weight = tf.cast(sample_weight, self.dtype)
        #    sample_weight = tf.broadcast_to(sample_weight, values.shape)
        #    values = tf.multiply(values, sample_weight)
            
        self.tp.assign_add(tf.reduce_sum(tp))
        self.fp.assign_add(tf.reduce_sum(fp))
        self.fn.assign_add(tf.reduce_sum(fn))
    
    def result(self):
        return (
            self.tp
            / (self.tp
               + 0.5 * (self.fp + self.fn))
        )
        

In [86]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[F1Score()])
model.fit(df_train.text.values, target, epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1499ac3a0>

In [87]:
model.evaluate(df_train.text.values, target)



[0.3222076892852783, 0.852076530456543]

In [88]:
y_pred = model.predict(df_train.text.values)
y_pred[y_pred > 0.5] = 1
y_pred = y_pred.astype('int')
df_res = pd.DataFrame(data={'text': df_train.text,
                            'pred': y_pred.flatten(),
                            'true': target.flatten()})

print(df_res[df_res.pred != df_res.true].head(5))
print(df_res[(df_res.pred == 1) & (df_res.pred == df_res.true)].head(5))
print(df_res[(df_res.pred == 0) & (df_res.pred == df_res.true)].head(5))

                                                 text  pred  true
31                   bbcmtd Wholesale Markets ablaze      0     1
56  TRUCK ABLAZE  R21 VOORTREKKER AVE OUTSIDE OR T...     0     1
73                BigRigRadio Live Accident Awareness     0     1
81  I was in a horrible car accident this past Sun...     0     1
82  Can wait to see how pissed Donnie is when I te...     1     0
                                                text  pred  true
0  Our Deeds are the Reason of this eahquake May ...     1     1
1              Forest fire near La Ronge Sask Canada     1     1
2  All residents asked to shelter in place are be...     1     1
3  13000 people receive wildfires evacuation orde...     1     1
4  Just got sent this photo from Ruby Alaska as s...     1     1
                      text  pred  true
15            Whats up man     0     0
16           I love fruits     0     0
17        Summer is lovely     0     0
18       My car is so fast     0     0
19  What a gooooooo

In [89]:
y_pred_test = model.predict(df_test.text.values)
y_pred_test[y_pred_test >= 0.5] = 1
y_pred_test[y_pred_test < 0.5] = 0
y_pred_test = y_pred_test.astype('int')

In [90]:
df_res_test = pd.DataFrame(data={'text': df_test.text,
                                 'pred': y_pred_test.flatten()})
df_res_test.head(5)

Unnamed: 0,text,pred
0,Just happened a terrible car crash,1
1,Heard about eahquake is different cities stay ...,1
2,there is a forest fire at spot pond geese are ...,1
3,Apocalypse lighting Spokane wildfires,1
4,Typhoon Soudelor kills 28 in China and Taiwan,1


In [91]:
df_submission = pd.DataFrame(data={'id': df_test.id, 'target': y_pred_test.flatten()})
df_submission.head(5)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [92]:
df_submission.to_csv('submission.csv', index=False)

In [93]:
!ls

NLP_disaster_tweet-Copy1.ipynb nlp-getting-started.zip
NLP_disaster_tweet.ipynb       submission.csv


In [94]:
!head submission.csv

id,target
0,1
2,1
3,1
9,1
11,1
12,1
21,0
22,0
27,0
