C3W4: Predicting the next word

Shakespeare’s sonnets

  • Just over 2,000 lines of text extracted from Shakespeare’s sonnets

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras import layers, losses
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
dataset_path = '../../../data/sonnets.txt'

with open(dataset_path) as f:
    data = f.read()

corpus = data.lower().split('\n')

print(f'{len(corpus)} lines of sonnets')
2159 lines of sonnets
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Number of distinct words in the corpus (word_index ids run from 1 to total_words)
total_words = len(tokenizer.word_index)
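A quick look at the fitted word index shows how the tokenizer maps words to integer ids; the most frequent words get the smallest ids. The exact entries depend on the corpus, so the output is not shown here.

# Peek at the vocabulary learned from the corpus
print(f"{total_words} distinct words")
print(list(tokenizer.word_index.items())[:5])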
def n_gram_seqs(corpus, tokenizer):
    """Turn each line into its n-gram prefixes: [w1, w2], [w1, w2, w3], ..."""
    input_sequences = []

    for line in corpus:
        seq = tokenizer.texts_to_sequences([line])[0]

        # Every prefix of length >= 2 becomes one training sequence
        for i in range(1, len(seq)):
            input_sequences.append(seq[:i+1])

    return input_sequences
first_example_sequence = n_gram_seqs([corpus[0]], tokenizer)

print("n_gram sequences for first example look like this:")
first_example_sequence
n_gram sequences for first example look like this:
[[34, 417],
 [34, 417, 877],
 [34, 417, 877, 166],
 [34, 417, 877, 166, 213],
 [34, 417, 877, 166, 213, 517]]
next_3_examples_sequence = n_gram_seqs(corpus[1:4], tokenizer)

print("n_gram sequences for next 3 examples look like this:")
next_3_examples_sequence
n_gram sequences for next 3 examples look like this:
[[8, 878],
 [8, 878, 134],
 [8, 878, 134, 351],
 [8, 878, 134, 351, 102],
 [8, 878, 134, 351, 102, 156],
 [8, 878, 134, 351, 102, 156, 199],
 [16, 22],
 [16, 22, 2],
 [16, 22, 2, 879],
 [16, 22, 2, 879, 61],
 [16, 22, 2, 879, 61, 30],
 [16, 22, 2, 879, 61, 30, 48],
 [16, 22, 2, 879, 61, 30, 48, 634],
 [25, 311],
 [25, 311, 635],
 [25, 311, 635, 102],
 [25, 311, 635, 102, 200],
 [25, 311, 635, 102, 200, 25],
 [25, 311, 635, 102, 200, 25, 278]]
input_sequences = n_gram_seqs(corpus, tokenizer)
max_sequence_len = max([len(x) for x in input_sequences])

print(f"len(input_sequences): {len(input_sequences)}")
print(f"max_sequence_len: {max_sequence_len}")
len(input_sequences): 15462
max_sequence_len: 11
def pad_seqs(input_sequences, maxlen):
    """Pre-pad (and, if necessary, truncate) every sequence to length maxlen."""
    padded_sequences = pad_sequences(input_sequences, maxlen=maxlen)

    return padded_sequences
first_padded_seq = pad_seqs(first_example_sequence, len(first_example_sequence))
first_padded_seq
array([[  0,   0,   0,  34, 417],
       [  0,   0,  34, 417, 877],
       [  0,  34, 417, 877, 166],
       [ 34, 417, 877, 166, 213],
       [417, 877, 166, 213, 517]])
max_sequence_len_3 = max([len(s) for s in next_3_examples_sequence])
next_3_padded_seq = pad_seqs(next_3_examples_sequence, max_sequence_len_3)
next_3_padded_seq
array([[  0,   0,   0,   0,   0,   0,   8, 878],
       [  0,   0,   0,   0,   0,   8, 878, 134],
       [  0,   0,   0,   0,   8, 878, 134, 351],
       [  0,   0,   0,   8, 878, 134, 351, 102],
       [  0,   0,   8, 878, 134, 351, 102, 156],
       [  0,   8, 878, 134, 351, 102, 156, 199],
       [  0,   0,   0,   0,   0,   0,  16,  22],
       [  0,   0,   0,   0,   0,  16,  22,   2],
       [  0,   0,   0,   0,  16,  22,   2, 879],
       [  0,   0,   0,  16,  22,   2, 879,  61],
       [  0,   0,  16,  22,   2, 879,  61,  30],
       [  0,  16,  22,   2, 879,  61,  30,  48],
       [ 16,  22,   2, 879,  61,  30,  48, 634],
       [  0,   0,   0,   0,   0,   0,  25, 311],
       [  0,   0,   0,   0,   0,  25, 311, 635],
       [  0,   0,   0,   0,  25, 311, 635, 102],
       [  0,   0,   0,  25, 311, 635, 102, 200],
       [  0,   0,  25, 311, 635, 102, 200,  25],
       [  0,  25, 311, 635, 102, 200,  25, 278]])
input_sequences = pad_seqs(input_sequences, max_sequence_len)

print(f"padded corpus has shape: {input_sequences.shape}")
padded corpus has shape: (15462, 11)
def features_and_labels(input_sequences, total_words):
    """Split each padded sequence into features (all tokens but the last) and a one-hot label."""
    list_features = []
    labels = []

    for seq in input_sequences:
        list_features.append(seq[:-1])
        # Shift the target token id down by 1 so labels fall in [0, total_words)
        labels.append(seq[-1] - 1)

    features = np.array(list_features)
    one_hot_labels = to_categorical(labels, num_classes=total_words)

    return features, one_hot_labels
first_features, first_labels = features_and_labels(first_padded_seq, total_words)

print(f"labels have shape: {first_labels.shape}")
print("\nfeatures look like this:\n")
first_features
labels have shape: (5, 3210)

features look like this:
array([[  0,   0,   0,  34],
       [  0,   0,  34, 417],
       [  0,  34, 417, 877],
       [ 34, 417, 877, 166],
       [417, 877, 166, 213]])
features, labels = features_and_labels(input_sequences, total_words)

print(f"features have shape: {features.shape}")
print(f"labels have shape: {labels.shape}")
features have shape: (15462, 10)
labels have shape: (15462, 3210)
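Since the label ids were shifted down by one in features_and_labels, recovering a word from a one-hot label means adding the one back before looking it up in index_word. A small sanity check along these lines (output not shown) makes the round trip explicit:

# Decode the target word of the first training example (undo the -1 shift)
first_label_id = int(np.argmax(labels[0])) + 1
print(tokenizer.index_word[first_label_id])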
def create_model(total_words, max_sequence_len):

    embedding_dim = 100
    lstm_dim = 128

    model = tf.keras.Sequential([
        # Token ids run from 1 to total_words and 0 is the padding id,
        # so the embedding needs total_words + 1 rows
        layers.Embedding(total_words + 1, embedding_dim, input_length=max_sequence_len),
        layers.Bidirectional(layers.LSTM(lstm_dim)),
        # Raw logits over the vocabulary; softmax is handled by the loss
        layers.Dense(total_words)
    ])

    model.compile(optimizer='adam',
                  loss=losses.CategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model
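Before training, it can be useful to instantiate the model once just to check the layer shapes and parameter counts; a throwaway instance is enough for that (summary output omitted here):

# Build a disposable instance only to inspect the architecture
create_model(total_words, max_sequence_len - 1).summary()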
model = create_model(total_words, max_sequence_len-1)

history = model.fit(features, labels, epochs=50, verbose=1)
Epoch 1/50
484/484 [==============================] - 11s 12ms/step - loss: 6.8855 - accuracy: 0.0238
Epoch 2/50
484/484 [==============================] - 5s 11ms/step - loss: 6.4448 - accuracy: 0.0325
Epoch 3/50
484/484 [==============================] - 6s 12ms/step - loss: 6.2318 - accuracy: 0.0400
Epoch 4/50
484/484 [==============================] - 5s 11ms/step - loss: 5.9839 - accuracy: 0.0455
Epoch 5/50
484/484 [==============================] - 5s 11ms/step - loss: 5.7049 - accuracy: 0.0604
Epoch 6/50
484/484 [==============================] - 5s 11ms/step - loss: 5.3748 - accuracy: 0.0739
Epoch 7/50
484/484 [==============================] - 5s 10ms/step - loss: 5.0123 - accuracy: 0.0924
Epoch 8/50
484/484 [==============================] - 5s 10ms/step - loss: 4.6417 - accuracy: 0.1150
Epoch 9/50
484/484 [==============================] - 5s 10ms/step - loss: 4.2685 - accuracy: 0.1535
Epoch 10/50
484/484 [==============================] - 5s 10ms/step - loss: 3.8965 - accuracy: 0.2056
Epoch 11/50
484/484 [==============================] - 6s 11ms/step - loss: 3.5550 - accuracy: 0.2654
Epoch 12/50
484/484 [==============================] - 5s 11ms/step - loss: 3.2320 - accuracy: 0.3252
Epoch 13/50
484/484 [==============================] - 6s 11ms/step - loss: 2.9419 - accuracy: 0.3773
Epoch 14/50
484/484 [==============================] - 5s 11ms/step - loss: 2.6818 - accuracy: 0.4324
Epoch 15/50
484/484 [==============================] - 5s 11ms/step - loss: 2.4436 - accuracy: 0.4783
Epoch 16/50
484/484 [==============================] - 6s 12ms/step - loss: 2.2404 - accuracy: 0.5184
Epoch 17/50
484/484 [==============================] - 6s 13ms/step - loss: 2.0522 - accuracy: 0.5605
Epoch 18/50
484/484 [==============================] - 6s 13ms/step - loss: 1.8852 - accuracy: 0.5995
Epoch 19/50
484/484 [==============================] - 7s 14ms/step - loss: 1.7328 - accuracy: 0.6348
Epoch 20/50
484/484 [==============================] - 7s 14ms/step - loss: 1.5934 - accuracy: 0.6641
Epoch 21/50
484/484 [==============================] - 7s 14ms/step - loss: 1.4765 - accuracy: 0.6908
Epoch 22/50
484/484 [==============================] - 7s 14ms/step - loss: 1.3572 - accuracy: 0.7182
Epoch 23/50
484/484 [==============================] - 7s 15ms/step - loss: 1.2559 - accuracy: 0.7386
Epoch 24/50
484/484 [==============================] - 7s 15ms/step - loss: 1.1739 - accuracy: 0.7542
Epoch 25/50
484/484 [==============================] - 7s 14ms/step - loss: 1.0938 - accuracy: 0.7704
Epoch 26/50
484/484 [==============================] - 7s 14ms/step - loss: 1.0224 - accuracy: 0.7861
Epoch 27/50
484/484 [==============================] - 7s 14ms/step - loss: 0.9632 - accuracy: 0.7994
Epoch 28/50
484/484 [==============================] - 7s 14ms/step - loss: 0.9088 - accuracy: 0.8072
Epoch 29/50
484/484 [==============================] - 6s 13ms/step - loss: 0.8611 - accuracy: 0.8152
Epoch 30/50
484/484 [==============================] - 7s 14ms/step - loss: 0.8203 - accuracy: 0.8221
Epoch 31/50
484/484 [==============================] - 7s 14ms/step - loss: 0.7882 - accuracy: 0.8272
Epoch 32/50
484/484 [==============================] - 7s 14ms/step - loss: 0.7572 - accuracy: 0.8291
Epoch 33/50
484/484 [==============================] - 6s 13ms/step - loss: 0.7308 - accuracy: 0.8359
Epoch 34/50
484/484 [==============================] - 6s 13ms/step - loss: 0.7040 - accuracy: 0.8366
Epoch 35/50
484/484 [==============================] - 7s 13ms/step - loss: 0.6881 - accuracy: 0.8403
Epoch 36/50
484/484 [==============================] - 6s 13ms/step - loss: 0.6729 - accuracy: 0.8407
Epoch 37/50
484/484 [==============================] - 7s 14ms/step - loss: 0.6577 - accuracy: 0.8430
Epoch 38/50
484/484 [==============================] - 7s 14ms/step - loss: 0.6424 - accuracy: 0.8444
Epoch 39/50
484/484 [==============================] - 7s 14ms/step - loss: 0.6399 - accuracy: 0.8443
Epoch 40/50
484/484 [==============================] - 7s 15ms/step - loss: 0.6332 - accuracy: 0.8445
Epoch 41/50
484/484 [==============================] - 7s 14ms/step - loss: 0.6161 - accuracy: 0.8465
Epoch 42/50
484/484 [==============================] - 8s 16ms/step - loss: 0.6089 - accuracy: 0.8470
Epoch 43/50
484/484 [==============================] - 6s 13ms/step - loss: 0.6022 - accuracy: 0.8453
Epoch 44/50
484/484 [==============================] - 7s 15ms/step - loss: 0.6002 - accuracy: 0.8468
Epoch 45/50
484/484 [==============================] - 6s 13ms/step - loss: 0.5906 - accuracy: 0.8479
Epoch 46/50
484/484 [==============================] - 6s 13ms/step - loss: 0.5897 - accuracy: 0.8479
Epoch 47/50
484/484 [==============================] - 7s 14ms/step - loss: 0.5813 - accuracy: 0.8491
Epoch 48/50
484/484 [==============================] - 7s 14ms/step - loss: 0.5825 - accuracy: 0.8478
Epoch 49/50
484/484 [==============================] - 7s 14ms/step - loss: 0.5796 - accuracy: 0.8476
Epoch 50/50
484/484 [==============================] - 6s 13ms/step - loss: 0.5701 - accuracy: 0.8489
acc = history.history['accuracy']
loss = history.history['loss']

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(acc)
plt.title('Training Accuracy')

plt.subplot(1, 2, 2)
plt.plot(loss)
plt.title('Training Loss')
plt.show()
[Figure: training accuracy (left) and training loss (right) over the 50 epochs]
seed_text = "Help me Obi Wan Kenobi, you're my only hope"
next_words = 100
  
for _ in range(next_words):
    # Tokenize the running seed text and pad it to the model's input length
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Greedy decoding: take the most likely next token and undo the -1 label shift
    predicted = model.predict(token_list, verbose=0)
    predicted = np.argmax(predicted, axis=-1).item() + 1
    output_word = tokenizer.index_word[predicted]
    seed_text += " " + output_word

print(seed_text)
Help me Obi Wan Kenobi, you're my only hope of good antique part place of me are dead dead or give none none cured thievish undivided undivided hurt or body large page page up willing ruin'd drops of friend grace to thee leave is so true aright is writ ' prove not seen decay live told to find away thee friend's rare more bright find room so more so strong report new can lose hell truth than one wrong breathers of blood ' lie in my age eye even so in my way mind much right their mind out out cheek my moan of brow myself alone had stol'n
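Greedy argmax decoding tends to lock onto repeated words (note the "dead dead", "none none" and "undivided undivided" above). A common alternative is to sample the next word from the temperature-scaled softmax of the logits; the sketch below reuses the trained model, tokenizer and max_sequence_len from the cells above, and sample_next_word is just an illustrative helper name.

# Sketch: temperature sampling instead of greedy argmax decoding
def sample_next_word(text, temperature=0.8):
    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    logits = model.predict(token_list, verbose=0)[0]
    # Temperature-scaled softmax over the vocabulary logits
    probs = tf.nn.softmax(logits / temperature).numpy().astype('float64')
    probs /= probs.sum()  # renormalize so np.random.choice accepts it
    sampled = np.random.choice(len(probs), p=probs)
    return tokenizer.index_word[int(sampled) + 1]  # undo the -1 label shift

sampled_text = "Help me Obi Wan Kenobi, you're my only hope"
for _ in range(20):
    sampled_text += " " + sample_next_word(sampled_text)
print(sampled_text)

Higher temperatures give more varied (and less coherent) text, while temperatures close to zero approach the greedy output shown above.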