C3W3: Exploring Overfitting in NLP

Sentiment140 dataset

  • 1.6 million tweets

  • each labeled with a sentiment: 0 for negative, 4 for positive (remapped to 0/1 during parsing)

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import csv
import re
import string
import random

from scipy.stats import linregress
from tensorflow.keras import layers, losses
EMBEDDING_DIM = 100
SEQUENCE_LENGTH = 16
OOV_TOKEN = '[UNK]'
MAX_EXAMPLES = 160000
TRAINING_SPLIT = .9
dataset_path = '../../../data/training_cleaned.csv'
def parse_data_from_file(filename):
    """Read the Sentiment140 CSV: the label (0 or 4) is in column 0, the tweet text in column 5."""

    sentences = []
    labels = []

    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)

        for row in reader:
            sentences.append(row[5])
            # Remap the original labels (0 = negative, 4 = positive) to 0/1
            labels.append([0] if int(row[0]) == 0 else [1])

    return sentences, labels
sentences, labels = parse_data_from_file(dataset_path)
total_tweets = len(sentences)
random.seed(42)
selected_tweets = random.sample(range(total_tweets), MAX_EXAMPLES)
sentences = [sentences[i] for i in selected_tweets]
labels = [labels[i] for i in selected_tweets]
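Sentiment140 is balanced between the two classes, so the random subsample should stay roughly 50/50. A minimal sanity check:

from collections import Counter

# Count how many sampled tweets fall in each class; both counts should be
# close to MAX_EXAMPLES / 2
label_counts = Counter(label[0] for label in labels)
print(f"Negative: {label_counts[0]}, Positive: {label_counts[1]}")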
def train_val_split(sentences, labels, training_split):

    train_size = int(len(sentences) * training_split)

    train_sentences = sentences[:train_size]
    train_labels = labels[:train_size]

    validation_sentences = sentences[train_size:]
    validation_labels = labels[train_size:]

    return train_sentences, validation_sentences, train_labels, validation_labels
train_sentences, val_sentences, train_labels, val_labels = train_val_split(sentences, labels, TRAINING_SPLIT)
print(f"{len(train_sentences)} sentences for training.")
print(f"{len(val_sentences)} sentences for validation.")
144000 sentences for training.
16000 sentences for validation.
def custom_standardization(input_data):

    # Lowercase the text, then replace every punctuation character with a space
    text = tf.strings.lower(input_data, encoding='utf-8')
    text = tf.strings.regex_replace(text, '[%s]' % re.escape(string.punctuation), ' ')

    return text
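To see what the standardization does, it can be applied to a made-up tweet (the sample text below is only illustrative):

sample_tweet = tf.constant("Loving the NEW update!!! So much faster :-)")
# Prints the lower-cased text with every punctuation character replaced by a space
print(custom_standardization(sample_tweet).numpy().decode('utf-8'))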
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    output_sequence_length=SEQUENCE_LENGTH)

vectorize_layer.adapt(train_sentences)

VOCAB_SIZE = len(vectorize_layer.get_vocabulary())
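The adapted vocabulary can be inspected directly; index 0 is the padding token and index 1 the OOV token:

vocab = vectorize_layer.get_vocabulary()
print(f"Vocabulary size: {VOCAB_SIZE}")
print(f"First tokens: {vocab[:10]}")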
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
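Applying vectorize_text to a single training example shows the mapping from raw text to a padded sequence of SEQUENCE_LENGTH token ids (a small illustration):

sample_text = train_sentences[0]
sample_ids, _ = vectorize_text(sample_text, train_labels[0])
print(sample_text)
print(sample_ids.numpy())  # shape (1, SEQUENCE_LENGTH)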
raw_train_ds = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels))
raw_val_ds = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels))

batch_size = 256

raw_train_ds = raw_train_ds.batch(batch_size)
raw_val_ds = raw_val_ds.batch(batch_size)

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
train_ds = train_ds.cache().prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().prefetch(tf.data.AUTOTUNE)
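Peeking at one batch confirms the shapes flowing into the model: integer inputs of shape (batch_size, SEQUENCE_LENGTH) and labels of shape (batch_size, 1).

for batch_inputs, batch_labels in train_ds.take(1):
    print(batch_inputs.shape, batch_labels.shape)  # (256, 16) (256, 1) for full batches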

Using pre-trained GloVe Embeddings

path_to_glove_file = '../../../data/glove.6B.100d.txt'

embeddings_index = {}

# Each line of the GloVe file is a word followed by its 100-dimensional vector
with open(path_to_glove_file) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
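As a quick check that the file was parsed correctly, the index should contain 400,000 vectors for the glove.6B files:

print(f"Loaded {len(embeddings_index)} GloVe word vectors.")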
# Build an embedding matrix aligned with the TextVectorization vocabulary;
# words without a GloVe vector keep an all-zero row
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for i, word in enumerate(vectorize_layer.get_vocabulary()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
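It is also worth checking how much of the tweet vocabulary is actually covered by GloVe; the exact hit rate depends on the sampled tweets, and uncovered tokens simply keep their zero row:

hits = sum(1 for word in vectorize_layer.get_vocabulary() if word in embeddings_index)
print(f"{hits} / {VOCAB_SIZE} vocabulary tokens have a GloVe vector ({hits / VOCAB_SIZE:.1%}).")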
def create_model(vocab_size, embedding_dim, sequence_length, embeddings_matrix):

    lstm_dim = 32
    dense_dim = 6

    model = tf.keras.Sequential([
        # Frozen pre-trained GloVe embeddings
        layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length,
                         weights=[embeddings_matrix], trainable=False),
        layers.Dropout(0.2),
        layers.Bidirectional(layers.LSTM(lstm_dim)),
        layers.Dropout(0.2),
        layers.Dense(dense_dim, activation='relu'),
        layers.Dense(1)  # single logit; the loss uses from_logits=True
    ])

    model.compile(optimizer='adam',
                  loss=losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model
model = create_model(VOCAB_SIZE, EMBEDDING_DIM, SEQUENCE_LENGTH, embedding_matrix)
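Calling summary() confirms that only the LSTM and Dense weights are trainable, since the GloVe embedding layer is frozen (output omitted here):

model.build(input_shape=(None, SEQUENCE_LENGTH))  # no-op if the model is already built
model.summary()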

history = model.fit(train_ds, epochs=20, validation_data=val_ds)
Epoch 1/20
563/563 [==============================] - 13s 15ms/step - loss: 0.5827 - accuracy: 0.6637 - val_loss: 0.5333 - val_accuracy: 0.7381
Epoch 2/20
563/563 [==============================] - 7s 13ms/step - loss: 0.5325 - accuracy: 0.7122 - val_loss: 0.5171 - val_accuracy: 0.7506
Epoch 3/20
563/563 [==============================] - 7s 13ms/step - loss: 0.5173 - accuracy: 0.7248 - val_loss: 0.5001 - val_accuracy: 0.7546
Epoch 4/20
563/563 [==============================] - 7s 13ms/step - loss: 0.5062 - accuracy: 0.7328 - val_loss: 0.5020 - val_accuracy: 0.7622
Epoch 5/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4978 - accuracy: 0.7391 - val_loss: 0.4884 - val_accuracy: 0.7619
Epoch 6/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4930 - accuracy: 0.7430 - val_loss: 0.4923 - val_accuracy: 0.7646
Epoch 7/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4860 - accuracy: 0.7473 - val_loss: 0.4891 - val_accuracy: 0.7678
Epoch 8/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4833 - accuracy: 0.7508 - val_loss: 0.4911 - val_accuracy: 0.7694
Epoch 9/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4788 - accuracy: 0.7536 - val_loss: 0.4883 - val_accuracy: 0.7682
Epoch 10/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4755 - accuracy: 0.7558 - val_loss: 0.4841 - val_accuracy: 0.7716
Epoch 11/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4725 - accuracy: 0.7581 - val_loss: 0.4772 - val_accuracy: 0.7727
Epoch 12/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4697 - accuracy: 0.7603 - val_loss: 0.4820 - val_accuracy: 0.7739
Epoch 13/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4669 - accuracy: 0.7615 - val_loss: 0.4710 - val_accuracy: 0.7720
Epoch 14/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4644 - accuracy: 0.7641 - val_loss: 0.4747 - val_accuracy: 0.7756
Epoch 15/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4621 - accuracy: 0.7661 - val_loss: 0.4755 - val_accuracy: 0.7757
Epoch 16/20
563/563 [==============================] - 7s 12ms/step - loss: 0.4594 - accuracy: 0.7673 - val_loss: 0.4741 - val_accuracy: 0.7765
Epoch 17/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4583 - accuracy: 0.7673 - val_loss: 0.4702 - val_accuracy: 0.7764
Epoch 18/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4559 - accuracy: 0.7705 - val_loss: 0.4685 - val_accuracy: 0.7764
Epoch 19/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4547 - accuracy: 0.7704 - val_loss: 0.4673 - val_accuracy: 0.7754
Epoch 20/20
563/563 [==============================] - 7s 13ms/step - loss: 0.4523 - accuracy: 0.7715 - val_loss: 0.4683 - val_accuracy: 0.7801

Notes

    lstm_dim = 32
    dense_dim = 6

    model = tf.keras.Sequential([ 
        ...
        layers.Dropout(0.2),
        layers.Bidirectional(layers.LSTM(lstm_dim)),
        layers.Dropout(0.2),
        layers.Dense(dense_dim, activation='relu'),
        layers.Dense(1)
    ])

loss: 0.4602 - accuracy: 0.7667 - val_loss: 0.4707 - val_accuracy: 0.7747
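With frozen GloVe embeddings, dropout, and 144,000 training tweets, the validation loss keeps drifting down rather than turning upward. A hypothetical variation that should make overfitting easier to observe (a sketch only, not run above) is to unfreeze the embeddings and remove the dropout:

def create_overfit_prone_model(vocab_size, embedding_dim, sequence_length, embeddings_matrix):
    # Same architecture, but with trainable embeddings and no Dropout layers,
    # so training accuracy can climb well past validation accuracy
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length,
                         weights=[embeddings_matrix], trainable=True),
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(6, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam',
                  loss=losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    return model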

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(acc, label='Training Accuracy')
plt.plot(val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(loss, label='Training loss')
plt.plot(val_loss, label='Validation loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()
[Figure: training and validation accuracy (left) and loss (right) over the 20 epochs]
epochs = [*range(20)]
slope, *_ = linregress(epochs, val_loss)
print(f"The slope of your validation loss curve is {slope:.5f}")
The slope of your validation loss curve is -0.00264
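A negative slope means the validation loss is still trending down after 20 epochs, so there is no sign of overfitting yet. A simple check on the sign (the zero threshold is only illustrative):

if slope <= 0:
    print("Validation loss is flat or decreasing -- no clear overfitting.")
else:
    print("Validation loss is trending upward -- the model is starting to overfit.")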