{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e53b68c0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /Users/navy/.kaggle/kaggle.json'\n",
      "nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
     ]
    }
   ],
   "source": [
    "!kaggle competitions download -c nlp-getting-started"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1d7e380b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NLP_disaster_tweet-Copy1.ipynb nlp-getting-started.zip\r\n",
      "NLP_disaster_tweet.ipynb       submission.csv\r\n"
     ]
    }
   ],
   "source": [
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6c6fcfb8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train: 7613, test: 3263\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>keyword</th>\n",
       "      <th>location</th>\n",
       "      <th>text</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Our Deeds are the Reason of this #earthquake M...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Forest fire near La Ronge Sask. Canada</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>All residents asked to 'shelter in place' are ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13,000 people receive #wildfires evacuation or...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Just got sent this photo from Ruby #Alaska as ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id keyword location                                               text  \\\n",
       "0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   \n",
       "1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   \n",
       "2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   \n",
       "3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   \n",
       "4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   \n",
       "\n",
       "   target  \n",
       "0       1  \n",
       "1       1  \n",
       "2       1  \n",
       "3       1  \n",
       "4       1  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from zipfile import ZipFile\n",
    "\n",
    "data_file = 'nlp-getting-started.zip'\n",
    "\n",
    "with ZipFile(data_file, 'r') as zip:\n",
    "    df_train = pd.read_csv(zip.open('train.csv', 'r'))\n",
    "    df_test = pd.read_csv(zip.open('test.csv', 'r'))\n",
    "    \n",
    "print(f'train: {len(df_train)}, test: {len(df_test)}')\n",
    "df_train.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "381fca2f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.target.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7fe21d8a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"What's up man?\""
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train[df_train.target == 0].text.values[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a93582f0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train[df_train.target == 1].text.values[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aed5ef45",
   "metadata": {},
   "source": [
    "### Pre-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d32dfa41",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import string\n",
    "\n",
    "def process_text(data):\n",
    "    tx = data.apply(lambda x: re.sub(\"http\\S+\", '', str(x)))\n",
    "    tx = tx.apply(lambda x: re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '',x))\n",
    "    tx = tx.apply(lambda x: re.sub(' +', ' ', x))\n",
    "    tx = tx.apply(lambda x: re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', x))\n",
    "    tx = tx.apply(lambda x: re.sub('(@[A-Za-z]+[A-za-z0-9-_]+)', '', x))\n",
    "    tx = tx.apply(lambda x: re.sub('rt', '', x))\n",
    "    tx = tx.apply(lambda x: ''.join([i for i in x if i not in string.punctuation]))\n",
    "    return tx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "93e63bb9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>keyword</th>\n",
       "      <th>location</th>\n",
       "      <th>text</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Our Deeds are the Reason of this #earthquake M...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Forest fire near La Ronge Sask. Canada</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>All residents asked to 'shelter in place' are ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13,000 people receive #wildfires evacuation or...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Just got sent this photo from Ruby #Alaska as ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id keyword location                                               text  \\\n",
       "0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   \n",
       "1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   \n",
       "2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   \n",
       "3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   \n",
       "4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   \n",
       "\n",
       "   target  \n",
       "0       1  \n",
       "1       1  \n",
       "2       1  \n",
       "3       1  \n",
       "4       1  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "02c751c0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>keyword</th>\n",
       "      <th>location</th>\n",
       "      <th>text</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Our Deeds are the Reason of this eahquake May ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Forest fire near La Ronge Sask Canada</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>All residents asked to shelter in place are be...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13000 people receive wildfires evacuation orde...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Just got sent this photo from Ruby Alaska as s...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id keyword location                                               text  \\\n",
       "0   1     NaN      NaN  Our Deeds are the Reason of this eahquake May ...   \n",
       "1   4     NaN      NaN              Forest fire near La Ronge Sask Canada   \n",
       "2   5     NaN      NaN  All residents asked to shelter in place are be...   \n",
       "3   6     NaN      NaN  13000 people receive wildfires evacuation orde...   \n",
       "4   7     NaN      NaN  Just got sent this photo from Ruby Alaska as s...   \n",
       "\n",
       "   target  \n",
       "0       1  \n",
       "1       1  \n",
       "2       1  \n",
       "3       1  \n",
       "4       1  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.text = process_text(df_train.text)\n",
    "df_test.text = process_text(df_test.text)\n",
    "df_train.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "494b4450",
   "metadata": {},
   "source": [
    "### Baseline: Ridge Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "df884374",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn import feature_extraction, linear_model, model_selection, preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "acf1f64d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7613, 17956)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count_vectorizer = feature_extraction.text.CountVectorizer()\n",
    "\n",
    "train_vectors = count_vectorizer.fit_transform(df_train.text)\n",
    "train_vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1554a05c",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_vectors = count_vectorizer.transform(df_test.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "dc021e9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = linear_model.RidgeClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "b23833a9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.6123302 , 0.55121227, 0.61226508])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores = model_selection.cross_val_score(clf, train_vectors, df_train.target,\n",
    "                                         cv=3, scoring='f1')\n",
    "scores"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44e00d9d",
   "metadata": {},
   "source": [
    "### RNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "7a9f8f53",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "2bb0f1e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "VOCAB_SIZE = 5000\n",
    "encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(\n",
    "    max_tokens=VOCAB_SIZE)\n",
    "encoder.adapt(df_train.text.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "cdaab219",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is', 'for',\n",
       "       'on', 'you', 'my', 'with', 'it', 'that', 'at', 'by', 'this'],\n",
       "      dtype='<U50')"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = np.array(encoder.get_vocabulary())\n",
    "vocab[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "eee1a649",
   "metadata": {},
   "outputs": [],
   "source": [
    "target = df_train.target.values.reshape(-1, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "23975893",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = tf.keras.Sequential()\n",
    "model.add(encoder)\n",
    "model.add(tf.keras.layers.Embedding(len(encoder.get_vocabulary()),\n",
    "                                    64,\n",
    "                                    mask_zero=True))\n",
    "model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True,\n",
    "                                                             dropout=0.3, recurrent_dropout=0.3)))\n",
    "model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, dropout=0.3, recurrent_dropout=0.3)))\n",
    "model.add(tf.keras.layers.Dense(64, activation='relu'))\n",
    "model.add(tf.keras.layers.Dense(1, activation='sigmoid'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "29ddb997",
   "metadata": {},
   "outputs": [],
   "source": [
    "class F1Score(tf.keras.metrics.Metric):\n",
    "    def __init__(self, name='f1_score', **kwargs):\n",
    "        super(F1Score, self).__init__(name=name, **kwargs)\n",
    "        self.tp = self.add_weight(name='tp',\n",
    "                                  initializer='zeros')\n",
    "        self.fp = self.add_weight(name='fp',\n",
    "                                  initializer='zeros')\n",
    "        self.fn = self.add_weight(name='fn',\n",
    "                                  initializer='zeros')\n",
    "    \n",
    "    def update_state(self, y_true, y_pred, sample_weight=None):\n",
    "        y_true = tf.cast(y_true, tf.bool)\n",
    "        #cond = tf.greater(y_pred, tf.broadcast_to(0.5, y_pred.shape))\n",
    "        #y_pred = tf.where(cond,\n",
    "        #         tf.broadcast_to(True, y_pred.shape),\n",
    "        #         tf.broadcast_to(False, y_pred.shape))\n",
    "        y_pred = tf.cast(y_pred * 2, tf.int32)\n",
    "        y_pred = tf.cast(y_pred, tf.bool)\n",
    "        \n",
    "        tp = tf.logical_and(tf.equal(y_pred, True),\n",
    "                            tf.equal(y_true, True))\n",
    "        fp = tf.logical_and(tf.equal(y_pred, True),\n",
    "                            tf.equal(y_true, False))\n",
    "        fn = tf.logical_and(tf.equal(y_pred, False),\n",
    "                            tf.equal(y_true, True))\n",
    "        tp = tf.cast(tp, self.dtype)\n",
    "        fp = tf.cast(fp, self.dtype)\n",
    "        fn = tf.cast(fn, self.dtype)\n",
    "        \n",
    "        #if sample_weight is not None:\n",
    "        #    sample_weight = tf.cast(sample_weight, self.dtype)\n",
    "        #    sample_weight = tf.broadcast_to(sample_weight, values.shape)\n",
    "        #    values = tf.multiply(values, sample_weight)\n",
    "            \n",
    "        self.tp.assign_add(tf.reduce_sum(tp))\n",
    "        self.fp.assign_add(tf.reduce_sum(fp))\n",
    "        self.fn.assign_add(tf.reduce_sum(fn))\n",
    "    \n",
    "    def result(self):\n",
    "        return (\n",
    "            self.tp\n",
    "            / (self.tp\n",
    "               + 0.5 * (self.fp + self.fn))\n",
    "        )\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "41cc4179",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "191/191 [==============================] - 25s 76ms/step - loss: 0.6864 - f1_score: 0.0116 - val_loss: 0.6727 - val_f1_score: 0.0028\n",
      "Epoch 2/5\n",
      "191/191 [==============================] - 13s 70ms/step - loss: 0.6175 - f1_score: 0.3803 - val_loss: 0.5222 - val_f1_score: 0.7369\n",
      "Epoch 3/5\n",
      "191/191 [==============================] - 14s 74ms/step - loss: 0.4309 - f1_score: 0.7799 - val_loss: 0.4810 - val_f1_score: 0.7479\n",
      "Epoch 4/5\n",
      "191/191 [==============================] - 16s 84ms/step - loss: 0.3614 - f1_score: 0.8244 - val_loss: 0.4732 - val_f1_score: 0.7618\n",
      "Epoch 5/5\n",
      "191/191 [==============================] - 15s 77ms/step - loss: 0.3205 - f1_score: 0.8432 - val_loss: 0.5007 - val_f1_score: 0.7544\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x1499ac3a0>"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),\n",
    "              optimizer=tf.keras.optimizers.Adam(1e-4),\n",
    "              metrics=[F1Score()])\n",
    "model.fit(df_train.text.values, target, epochs=5, validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "32eb2742",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "238/238 [==============================] - 2s 10ms/step - loss: 0.3222 - f1_score: 0.8521\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.3222076892852783, 0.852076530456543]"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.evaluate(df_train.text.values, target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "30318e77",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                 text  pred  true\n",
      "31                   bbcmtd Wholesale Markets ablaze      0     1\n",
      "56  TRUCK ABLAZE  R21 VOORTREKKER AVE OUTSIDE OR T...     0     1\n",
      "73                BigRigRadio Live Accident Awareness     0     1\n",
      "81  I was in a horrible car accident this past Sun...     0     1\n",
      "82  Can wait to see how pissed Donnie is when I te...     1     0\n",
      "                                                text  pred  true\n",
      "0  Our Deeds are the Reason of this eahquake May ...     1     1\n",
      "1              Forest fire near La Ronge Sask Canada     1     1\n",
      "2  All residents asked to shelter in place are be...     1     1\n",
      "3  13000 people receive wildfires evacuation orde...     1     1\n",
      "4  Just got sent this photo from Ruby Alaska as s...     1     1\n",
      "                      text  pred  true\n",
      "15            Whats up man     0     0\n",
      "16           I love fruits     0     0\n",
      "17        Summer is lovely     0     0\n",
      "18       My car is so fast     0     0\n",
      "19  What a goooooooaaaaaal     0     0\n"
     ]
    }
   ],
   "source": [
    "y_pred = model.predict(df_train.text.values)\n",
    "y_pred[y_pred > 0.5] = 1\n",
    "y_pred = y_pred.astype('int')\n",
    "df_res = pd.DataFrame(data={'text': df_train.text,\n",
    "                            'pred': y_pred.flatten(),\n",
    "                            'true': target.flatten()})\n",
    "\n",
    "print(df_res[df_res.pred != df_res.true].head(5))\n",
    "print(df_res[(df_res.pred == 1) & (df_res.pred == df_res.true)].head(5))\n",
    "print(df_res[(df_res.pred == 0) & (df_res.pred == df_res.true)].head(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "e49184d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_test = model.predict(df_test.text.values)\n",
    "y_pred_test[y_pred_test >= 0.5] = 1\n",
    "y_pred_test[y_pred_test < 0.5] = 0\n",
    "y_pred_test = y_pred_test.astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "73111f57",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>pred</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Just happened a terrible car crash</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Heard about eahquake is different cities stay ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>there is a forest fire at spot pond geese are ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Apocalypse lighting Spokane wildfires</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Typhoon Soudelor kills 28 in China and Taiwan</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  pred\n",
       "0                 Just happened a terrible car crash     1\n",
       "1  Heard about eahquake is different cities stay ...     1\n",
       "2  there is a forest fire at spot pond geese are ...     1\n",
       "3              Apocalypse lighting Spokane wildfires     1\n",
       "4      Typhoon Soudelor kills 28 in China and Taiwan     1"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_res_test = pd.DataFrame(data={'text': df_test.text,\n",
    "                                 'pred': y_pred_test.flatten()})\n",
    "df_res_test.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "7d3089c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  target\n",
       "0   0       1\n",
       "1   2       1\n",
       "2   3       1\n",
       "3   9       1\n",
       "4  11       1"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_submission = pd.DataFrame(data={'id': df_test.id, 'target': y_pred_test.flatten()})\n",
    "df_submission.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "9efddab8",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_submission.to_csv('submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "adc10a82",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NLP_disaster_tweet-Copy1.ipynb nlp-getting-started.zip\r\n",
      "NLP_disaster_tweet.ipynb       submission.csv\r\n"
     ]
    }
   ],
   "source": [
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "303e2d39",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id,target\r\n",
      "0,1\r\n",
      "2,1\r\n",
      "3,1\r\n",
      "9,1\r\n",
      "11,1\r\n",
      "12,1\r\n",
      "21,0\r\n",
      "22,0\r\n",
      "27,0\r\n"
     ]
    }
   ],
   "source": [
    "!head submission.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1306f9c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kaggle",
   "language": "python",
   "name": "kaggle"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}