{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "e53b68c0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /Users/navy/.kaggle/kaggle.json'\n",
"nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
]
}
],
"source": [
"!kaggle competitions download -c nlp-getting-started"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1d7e380b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NLP_disaster_tweet-Copy1.ipynb nlp-getting-started.zip\r\n",
"NLP_disaster_tweet.ipynb submission.csv\r\n"
]
}
],
"source": [
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6c6fcfb8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train: 7613, test: 3263\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" keyword | \n",
" location | \n",
" text | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" Our Deeds are the Reason of this #earthquake M... | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" NaN | \n",
" NaN | \n",
" Forest fire near La Ronge Sask. Canada | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" NaN | \n",
" NaN | \n",
" All residents asked to 'shelter in place' are ... | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" NaN | \n",
" NaN | \n",
" 13,000 people receive #wildfires evacuation or... | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 7 | \n",
" NaN | \n",
" NaN | \n",
" Just got sent this photo from Ruby #Alaska as ... | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id keyword location text \\\n",
"0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n",
"1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n",
"2 5 NaN NaN All residents asked to 'shelter in place' are ... \n",
"3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n",
"4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n",
"\n",
" target \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from zipfile import ZipFile\n",
"\n",
"data_file = 'nlp-getting-started.zip'\n",
"\n",
"# Read both competition CSVs straight from the downloaded archive.\n",
"# The handle is called `archive` rather than `zip` so the builtin zip()\n",
"# is not shadowed in later cells, and every member file is closed as soon\n",
"# as pandas has finished parsing it.\n",
"with ZipFile(data_file, 'r') as archive:\n",
"    with archive.open('train.csv', 'r') as train_file:\n",
"        df_train = pd.read_csv(train_file)\n",
"    with archive.open('test.csv', 'r') as test_file:\n",
"        df_test = pd.read_csv(test_file)\n",
"\n",
"print(f'train: {len(df_train)}, test: {len(df_test)}')\n",
"df_train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "381fca2f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 0])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.target.unique()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7fe21d8a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"What's up man?\""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train[df_train.target == 0].text.values[0]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a93582f0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train[df_train.target == 1].text.values[0]"
]
},
{
"cell_type": "markdown",
"id": "aed5ef45",
"metadata": {},
"source": [
"### Pre-processing"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d32dfa41",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import string\n",
"\n",
"def process_text(data):\n",
" tx = data.apply(lambda x: re.sub(\"http\\S+\", '', str(x)))\n",
" tx = tx.apply(lambda x: re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '',x))\n",
" tx = tx.apply(lambda x: re.sub(' +', ' ', x))\n",
" tx = tx.apply(lambda x: re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', x))\n",
" tx = tx.apply(lambda x: re.sub('(@[A-Za-z]+[A-za-z0-9-_]+)', '', x))\n",
" tx = tx.apply(lambda x: re.sub('rt', '', x))\n",
" tx = tx.apply(lambda x: ''.join([i for i in x if i not in string.punctuation]))\n",
" return tx"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "93e63bb9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" keyword | \n",
" location | \n",
" text | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" Our Deeds are the Reason of this #earthquake M... | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" NaN | \n",
" NaN | \n",
" Forest fire near La Ronge Sask. Canada | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" NaN | \n",
" NaN | \n",
" All residents asked to 'shelter in place' are ... | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" NaN | \n",
" NaN | \n",
" 13,000 people receive #wildfires evacuation or... | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 7 | \n",
" NaN | \n",
" NaN | \n",
" Just got sent this photo from Ruby #Alaska as ... | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id keyword location text \\\n",
"0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n",
"1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n",
"2 5 NaN NaN All residents asked to 'shelter in place' are ... \n",
"3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n",
"4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n",
"\n",
" target \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "02c751c0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" keyword | \n",
" location | \n",
" text | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" Our Deeds are the Reason of this eahquake May ... | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" NaN | \n",
" NaN | \n",
" Forest fire near La Ronge Sask Canada | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" NaN | \n",
" NaN | \n",
" All residents asked to shelter in place are be... | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" NaN | \n",
" NaN | \n",
" 13000 people receive wildfires evacuation orde... | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 7 | \n",
" NaN | \n",
" NaN | \n",
" Just got sent this photo from Ruby Alaska as s... | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id keyword location text \\\n",
"0 1 NaN NaN Our Deeds are the Reason of this eahquake May ... \n",
"1 4 NaN NaN Forest fire near La Ronge Sask Canada \n",
"2 5 NaN NaN All residents asked to shelter in place are be... \n",
"3 6 NaN NaN 13000 people receive wildfires evacuation orde... \n",
"4 7 NaN NaN Just got sent this photo from Ruby Alaska as s... \n",
"\n",
" target \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.text = process_text(df_train.text)\n",
"df_test.text = process_text(df_test.text)\n",
"df_train.head(5)"
]
},
{
"cell_type": "markdown",
"id": "494b4450",
"metadata": {},
"source": [
"### Baseline: Ridge Classifier"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "df884374",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn import feature_extraction, linear_model, model_selection, preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "acf1f64d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7613, 17956)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_vectorizer = feature_extraction.text.CountVectorizer()\n",
"\n",
"train_vectors = count_vectorizer.fit_transform(df_train.text)\n",
"train_vectors.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1554a05c",
"metadata": {},
"outputs": [],
"source": [
"test_vectors = count_vectorizer.transform(df_test.text)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "dc021e9b",
"metadata": {},
"outputs": [],
"source": [
"clf = linear_model.RidgeClassifier()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "b23833a9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.6123302 , 0.55121227, 0.61226508])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores = model_selection.cross_val_score(clf, train_vectors, df_train.target,\n",
" cv=3, scoring='f1')\n",
"scores"
]
},
{
"cell_type": "markdown",
"id": "44e00d9d",
"metadata": {},
"source": [
"### RNN"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "7a9f8f53",
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "2bb0f1e5",
"metadata": {},
"outputs": [],
"source": [
"VOCAB_SIZE = 5000\n",
"\n",
"# NOTE(review): TextVectorization is exposed as tf.keras.layers.TextVectorization\n",
"# in newer TensorFlow releases, and the experimental.preprocessing path is a\n",
"# deprecated alias. Prefer the stable name and fall back to the old one so the\n",
"# cell runs on either kind of install.\n",
"try:\n",
"    TextVectorization = tf.keras.layers.TextVectorization\n",
"except AttributeError:\n",
"    TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization\n",
"\n",
"# Vocabulary is capped at VOCAB_SIZE tokens and learned from the training text.\n",
"encoder = TextVectorization(max_tokens=VOCAB_SIZE)\n",
"encoder.adapt(df_train.text.values)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "cdaab219",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is', 'for',\n",
" 'on', 'you', 'my', 'with', 'it', 'that', 'at', 'by', 'this'],\n",
" dtype='"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),\n",
" optimizer=tf.keras.optimizers.Adam(1e-4),\n",
" metrics=[F1Score()])\n",
"model.fit(df_train.text.values, target, epochs=5, validation_split=0.2)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "32eb2742",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"238/238 [==============================] - 2s 10ms/step - loss: 0.3222 - f1_score: 0.8521\n"
]
},
{
"data": {
"text/plain": [
"[0.3222076892852783, 0.852076530456543]"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.evaluate(df_train.text.values, target)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "30318e77",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" text pred true\n",
"31 bbcmtd Wholesale Markets ablaze 0 1\n",
"56 TRUCK ABLAZE R21 VOORTREKKER AVE OUTSIDE OR T... 0 1\n",
"73 BigRigRadio Live Accident Awareness 0 1\n",
"81 I was in a horrible car accident this past Sun... 0 1\n",
"82 Can wait to see how pissed Donnie is when I te... 1 0\n",
" text pred true\n",
"0 Our Deeds are the Reason of this eahquake May ... 1 1\n",
"1 Forest fire near La Ronge Sask Canada 1 1\n",
"2 All residents asked to shelter in place are be... 1 1\n",
"3 13000 people receive wildfires evacuation orde... 1 1\n",
"4 Just got sent this photo from Ruby Alaska as s... 1 1\n",
" text pred true\n",
"15 Whats up man 0 0\n",
"16 I love fruits 0 0\n",
"17 Summer is lovely 0 0\n",
"18 My car is so fast 0 0\n",
"19 What a goooooooaaaaaal 0 0\n"
]
}
],
"source": [
"# Convert the model outputs on the training tweets into hard 0/1 labels.\n",
"# The cut-off is `>= 0.5`, the same rule applied to the test predictions,\n",
"# and the comparison builds the labels directly instead of relying on int\n",
"# truncation to zero out the low scores.\n",
"y_prob = model.predict(df_train.text.values)\n",
"y_pred = (y_prob >= 0.5).astype('int')\n",
"df_res = pd.DataFrame(data={'text': df_train.text,\n",
"                            'pred': y_pred.flatten(),\n",
"                            'true': target.flatten()})\n",
"\n",
"# Misclassified tweets first, then correct positives, then correct negatives.\n",
"print(df_res[df_res.pred != df_res.true].head(5))\n",
"print(df_res[(df_res.pred == 1) & (df_res.pred == df_res.true)].head(5))\n",
"print(df_res[(df_res.pred == 0) & (df_res.pred == df_res.true)].head(5))"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "e49184d8",
"metadata": {},
"outputs": [],
"source": [
"# Score the test tweets, then binarise: outputs >= 0.5 become 1, the rest 0.\n",
"test_scores = model.predict(df_test.text.values)\n",
"y_pred_test = (test_scores >= 0.5).astype('int')"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "73111f57",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" pred | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Just happened a terrible car crash | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" Heard about eahquake is different cities stay ... | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" there is a forest fire at spot pond geese are ... | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" Apocalypse lighting Spokane wildfires | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" Typhoon Soudelor kills 28 in China and Taiwan | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text pred\n",
"0 Just happened a terrible car crash 1\n",
"1 Heard about eahquake is different cities stay ... 1\n",
"2 there is a forest fire at spot pond geese are ... 1\n",
"3 Apocalypse lighting Spokane wildfires 1\n",
"4 Typhoon Soudelor kills 28 in China and Taiwan 1"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_res_test = pd.DataFrame(data={'text': df_test.text,\n",
" 'pred': y_pred_test.flatten()})\n",
"df_res_test.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "7d3089c8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 11 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id target\n",
"0 0 1\n",
"1 2 1\n",
"2 3 1\n",
"3 9 1\n",
"4 11 1"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_submission = pd.DataFrame(data={'id': df_test.id, 'target': y_pred_test.flatten()})\n",
"df_submission.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "9efddab8",
"metadata": {},
"outputs": [],
"source": [
"df_submission.to_csv('submission.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "adc10a82",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NLP_disaster_tweet-Copy1.ipynb nlp-getting-started.zip\r\n",
"NLP_disaster_tweet.ipynb submission.csv\r\n"
]
}
],
"source": [
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "303e2d39",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id,target\r\n",
"0,1\r\n",
"2,1\r\n",
"3,1\r\n",
"9,1\r\n",
"11,1\r\n",
"12,1\r\n",
"21,0\r\n",
"22,0\r\n",
"27,0\r\n"
]
}
],
"source": [
"!head submission.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1306f9c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "kaggle",
"language": "python",
"name": "kaggle"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}