{ "cells": [ { "cell_type": "code", "execution_count": 5, "id": "e53b68c0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /Users/navy/.kaggle/kaggle.json'\n", "nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)\n" ] } ], "source": [ "!kaggle competitions download -c nlp-getting-started" ] }, { "cell_type": "code", "execution_count": 6, "id": "1d7e380b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NLP_disaster_tweet-Copy1.ipynb nlp-getting-started.zip\r\n", "NLP_disaster_tweet.ipynb submission.csv\r\n" ] } ], "source": [ "!ls" ] }, { "cell_type": "code", "execution_count": 7, "id": "6c6fcfb8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train: 7613, test: 3263\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywordlocationtexttarget
01NaNNaNOur Deeds are the Reason of this #earthquake M...1
14NaNNaNForest fire near La Ronge Sask. Canada1
25NaNNaNAll residents asked to 'shelter in place' are ...1
36NaNNaN13,000 people receive #wildfires evacuation or...1
47NaNNaNJust got sent this photo from Ruby #Alaska as ...1
\n", "
" ], "text/plain": [ " id keyword location text \\\n", "0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n", "1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n", "2 5 NaN NaN All residents asked to 'shelter in place' are ... \n", "3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n", "4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n", "\n", " target \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
"import pandas as pd\n",
"from zipfile import ZipFile\n",
"\n",
"data_file = 'nlp-getting-started.zip'\n",
"\n",
"# Read both CSV members straight from the archive named by `data_file`.\n",
"# The archive is bound to `archive` so the builtin `zip` is not shadowed,\n",
"# and each member handle is closed as soon as pandas has consumed it.\n",
"with ZipFile(data_file, 'r') as archive:\n",
"    with archive.open('train.csv', 'r') as train_file:\n",
"        df_train = pd.read_csv(train_file)\n",
"    with archive.open('test.csv', 'r') as test_file:\n",
"        df_test = pd.read_csv(test_file)\n",
"\n",
"print(f'train: {len(df_train)}, test: {len(df_test)}')\n",
"df_train.head(5)"
] }, { "cell_type": "code", "execution_count": 8, "id": "381fca2f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 0])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.target.unique()" ] }, { "cell_type": "code", "execution_count": 9, "id": "7fe21d8a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"What's up man?\"" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train[df_train.target == 0].text.values[0]" ] }, { "cell_type": "code", "execution_count": 10, "id": "a93582f0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train[df_train.target == 1].text.values[0]" ] }, { "cell_type": "markdown", "id": "aed5ef45", "metadata": {}, "source": [ "### Pre-processing" ] }, { "cell_type": "code", "execution_count": 11, "id": "d32dfa41", "metadata": {}, "outputs": [], "source": [ 
"import re\n",
"import string\n",
"\n",
"# Compiled once so the patterns are not rebuilt for every tweet.\n",
"_URL_RE = re.compile(r'http\\S+')\n",
"_DISALLOWED_RE = re.compile('[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]')\n",
"_RETWEET_RE = re.compile(r'\\brt\\b', flags=re.IGNORECASE)\n",
"_SPACES_RE = re.compile(' +')\n",
"_PUNCT_TABLE = str.maketrans('', '', string.punctuation)\n",
"\n",
"\n",
"def _clean_tweet(text):\n",
"    \"\"\"Clean one tweet; the individual steps are described in process_text.\"\"\"\n",
"    text = _URL_RE.sub('', str(text))\n",
"    text = _DISALLOWED_RE.sub('', text)\n",
"    # Whole-word match only: a bare 'rt' substitution would also eat the\n",
"    # letters inside words such as 'earthquake' or 'alert'.\n",
"    text = _RETWEET_RE.sub('', text)\n",
"    text = text.translate(_PUNCT_TABLE)\n",
"    # Collapse spaces last so the removals above do not leave gaps behind.\n",
"    return _SPACES_RE.sub(' ', text).strip()\n",
"\n",
"\n",
"def process_text(data):\n",
"    \"\"\"Return a cleaned copy of a collection of tweet texts.\n",
"\n",
"    Every value is converted with ``str`` and then, in order:\n",
"\n",
"    1. URLs (``http`` followed by non-space characters) are removed.\n",
"    2. Every character other than ASCII letters, digits, the accented\n",
"       letters listed in the pattern, ``:`` and space is removed. This\n",
"       strips the ``#`` and ``@`` symbols but keeps the tag text itself.\n",
"    3. The stand-alone retweet marker ``rt`` (any case) is removed.\n",
"    4. Any remaining ``string.punctuation`` character is removed.\n",
"    5. Runs of spaces are collapsed to one and the ends are trimmed.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : pandas.Series\n",
"        Raw texts; only its ``apply`` method is used.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The result of ``data.apply`` with one cleaned string per input value.\n",
"    \"\"\"\n",
"    return data.apply(_clean_tweet)"
] }, { "cell_type": "code", "execution_count": 12, "id": "93e63bb9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywordlocationtexttarget
01NaNNaNOur Deeds are the Reason of this #earthquake M...1
14NaNNaNForest fire near La Ronge Sask. Canada1
25NaNNaNAll residents asked to 'shelter in place' are ...1
36NaNNaN13,000 people receive #wildfires evacuation or...1
47NaNNaNJust got sent this photo from Ruby #Alaska as ...1
\n", "
" ], "text/plain": [ " id keyword location text \\\n", "0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n", "1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n", "2 5 NaN NaN All residents asked to 'shelter in place' are ... \n", "3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n", "4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n", "\n", " target \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.head(5)" ] }, { "cell_type": "code", "execution_count": 13, "id": "02c751c0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywordlocationtexttarget
01NaNNaNOur Deeds are the Reason of this eahquake May ...1
14NaNNaNForest fire near La Ronge Sask Canada1
25NaNNaNAll residents asked to shelter in place are be...1
36NaNNaN13000 people receive wildfires evacuation orde...1
47NaNNaNJust got sent this photo from Ruby Alaska as s...1
\n", "
" ], "text/plain": [ " id keyword location text \\\n", "0 1 NaN NaN Our Deeds are the Reason of this eahquake May ... \n", "1 4 NaN NaN Forest fire near La Ronge Sask Canada \n", "2 5 NaN NaN All residents asked to shelter in place are be... \n", "3 6 NaN NaN 13000 people receive wildfires evacuation orde... \n", "4 7 NaN NaN Just got sent this photo from Ruby Alaska as s... \n", "\n", " target \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.text = process_text(df_train.text)\n", "df_test.text = process_text(df_test.text)\n", "df_train.head(5)" ] }, { "cell_type": "markdown", "id": "494b4450", "metadata": {}, "source": [ "### Baseline: Ridge Classifier" ] }, { "cell_type": "code", "execution_count": 14, "id": "df884374", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from sklearn import feature_extraction, linear_model, model_selection, preprocessing" ] }, { "cell_type": "code", "execution_count": 15, "id": "acf1f64d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7613, 17956)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "count_vectorizer = feature_extraction.text.CountVectorizer()\n", "\n", "train_vectors = count_vectorizer.fit_transform(df_train.text)\n", "train_vectors.shape" ] }, { "cell_type": "code", "execution_count": 16, "id": "1554a05c", "metadata": {}, "outputs": [], "source": [ "test_vectors = count_vectorizer.transform(df_test.text)" ] }, { "cell_type": "code", "execution_count": 17, "id": "dc021e9b", "metadata": {}, "outputs": [], "source": [ "clf = linear_model.RidgeClassifier()" ] }, { "cell_type": "code", "execution_count": 18, "id": "b23833a9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.6123302 , 0.55121227, 0.61226508])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores = 
model_selection.cross_val_score(clf, train_vectors, df_train.target,\n", " cv=3, scoring='f1')\n", "scores" ] }, { "cell_type": "markdown", "id": "44e00d9d", "metadata": {}, "source": [ "### RNN" ] }, { "cell_type": "code", "execution_count": 19, "id": "7a9f8f53", "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf" ] }, { "cell_type": "code", "execution_count": 63, "id": "2bb0f1e5", "metadata": {}, "outputs": [], "source": [ "VOCAB_SIZE = 5000\n", "encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(\n", " max_tokens=VOCAB_SIZE)\n", "encoder.adapt(df_train.text.values)" ] }, { "cell_type": "code", "execution_count": 64, "id": "cdaab219", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is', 'for',\n", " 'on', 'you', 'my', 'with', 'it', 'that', 'at', 'by', 'this'],\n", " dtype='" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),\n", " optimizer=tf.keras.optimizers.Adam(1e-4),\n", " metrics=[F1Score()])\n", "model.fit(df_train.text.values, target, epochs=5, validation_split=0.2)" ] }, { "cell_type": "code", "execution_count": 87, "id": "32eb2742", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "238/238 [==============================] - 2s 10ms/step - loss: 0.3222 - f1_score: 0.8521\n" ] }, { "data": { "text/plain": [ "[0.3222076892852783, 0.852076530456543]" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.evaluate(df_train.text.values, target)" ] }, { "cell_type": "code", "execution_count": 88, "id": "30318e77", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " text pred true\n", "31 bbcmtd Wholesale Markets ablaze 0 1\n", "56 TRUCK ABLAZE R21 VOORTREKKER AVE OUTSIDE OR T... 
0 1\n", "73 BigRigRadio Live Accident Awareness 0 1\n", "81 I was in a horrible car accident this past Sun... 0 1\n", "82 Can wait to see how pissed Donnie is when I te... 1 0\n", " text pred true\n", "0 Our Deeds are the Reason of this eahquake May ... 1 1\n", "1 Forest fire near La Ronge Sask Canada 1 1\n", "2 All residents asked to shelter in place are be... 1 1\n", "3 13000 people receive wildfires evacuation orde... 1 1\n", "4 Just got sent this photo from Ruby Alaska as s... 1 1\n", " text pred true\n", "15 Whats up man 0 0\n", "16 I love fruits 0 0\n", "17 Summer is lovely 0 0\n", "18 My car is so fast 0 0\n", "19 What a goooooooaaaaaal 0 0\n" ] } ], "source": [
"# Turn the model scores into 0/1 labels with an explicit, inclusive 0.5\n",
"# cut-off instead of relying on integer truncation of the raw scores.\n",
"y_pred = (model.predict(df_train.text.values) >= 0.5).astype('int')\n",
"df_res = pd.DataFrame(data={'text': df_train.text,\n",
"                            'pred': y_pred.flatten(),\n",
"                            'true': target.flatten()})\n",
"\n",
"# Misclassified rows, then correctly predicted positives and negatives.\n",
"print(df_res[df_res.pred != df_res.true].head(5))\n",
"print(df_res[(df_res.pred == 1) & (df_res.pred == df_res.true)].head(5))\n",
"print(df_res[(df_res.pred == 0) & (df_res.pred == df_res.true)].head(5))"
] }, { "cell_type": "code", "execution_count": 89, "id": "e49184d8", "metadata": {}, "outputs": [], "source": [
"# Same inclusive 0.5 cut-off as used for the training-set predictions.\n",
"y_pred_test = (model.predict(df_test.text.values) >= 0.5).astype('int')"
] }, { "cell_type": "code", "execution_count": 90, "id": "73111f57", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textpred
0Just happened a terrible car crash1
1Heard about eahquake is different cities stay ...1
2there is a forest fire at spot pond geese are ...1
3Apocalypse lighting Spokane wildfires1
4Typhoon Soudelor kills 28 in China and Taiwan1
\n", "
" ], "text/plain": [ " text pred\n", "0 Just happened a terrible car crash 1\n", "1 Heard about eahquake is different cities stay ... 1\n", "2 there is a forest fire at spot pond geese are ... 1\n", "3 Apocalypse lighting Spokane wildfires 1\n", "4 Typhoon Soudelor kills 28 in China and Taiwan 1" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_res_test = pd.DataFrame(data={'text': df_test.text,\n", " 'pred': y_pred_test.flatten()})\n", "df_res_test.head(5)" ] }, { "cell_type": "code", "execution_count": 91, "id": "7d3089c8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtarget
001
121
231
391
4111
\n", "
" ], "text/plain": [ " id target\n", "0 0 1\n", "1 2 1\n", "2 3 1\n", "3 9 1\n", "4 11 1" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_submission = pd.DataFrame(data={'id': df_test.id, 'target': y_pred_test.flatten()})\n", "df_submission.head(5)" ] }, { "cell_type": "code", "execution_count": 92, "id": "9efddab8", "metadata": {}, "outputs": [], "source": [ "df_submission.to_csv('submission.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 93, "id": "adc10a82", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NLP_disaster_tweet-Copy1.ipynb nlp-getting-started.zip\r\n", "NLP_disaster_tweet.ipynb submission.csv\r\n" ] } ], "source": [ "!ls" ] }, { "cell_type": "code", "execution_count": 94, "id": "303e2d39", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id,target\r\n", "0,1\r\n", "2,1\r\n", "3,1\r\n", "9,1\r\n", "11,1\r\n", "12,1\r\n", "21,0\r\n", "22,0\r\n", "27,0\r\n" ] } ], "source": [ "!head submission.csv" ] }, { "cell_type": "code", "execution_count": null, "id": "a1306f9c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kaggle", "language": "python", "name": "kaggle" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }