C2W2: Tackle Overfitting with Data Augmentation#
import tensorflow as tf
import matplotlib.pyplot as plt
import pathlib
import os
from tensorflow.keras import layers, losses
from_scratch = False  # set True to re-extract the archive and re-run the image validation below
dataset_url = "https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip"
dataset_file = tf.keras.utils.get_file(origin=dataset_url, extract=from_scratch)
data_path = pathlib.Path(os.path.join(os.path.dirname(dataset_file), 'PetImages'))
source_path_cats = os.path.join(data_path, 'Cat')
source_path_dogs = os.path.join(data_path, 'Dog')
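The archive should unpack into one subdirectory per class. A quick sanity check (a sketch, not part of the original notebook):
# The two class folders referenced above should be the only directories here
print(sorted(p.name for p in data_path.iterdir() if p.is_dir()))
# Expected: ['Cat', 'Dog']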
Data cleaning#
# Remove leftover Windows thumbnail caches (Thumbs.db) bundled with the archive
for file in data_path.glob('*/*.db'):
    os.remove(file)
    print(f'Removed {file}')
def print_num_images():
print(f'There are {len(os.listdir(source_path_dogs))} images of dogs')
print(f'There are {len(os.listdir(source_path_cats))} images of cats')
print_num_images()
There are 12494 images of dogs
There are 12497 images of cats
if from_scratch:
    # Drop files that TensorFlow cannot decode; this single-process loop is
    # slow, so a standalone multiprocessing script helps (sketch below)
    for image_path in data_path.glob('*/*.jpg'):
        try:
            img_bytes = tf.io.read_file(str(image_path))
            tf.io.decode_image(img_bytes)
        except tf.errors.InvalidArgumentError:
            print(f'Remove {image_path}')
            os.remove(image_path)
print_num_images()
There are 12494 images of dogs
There are 12497 images of cats
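A minimal sketch of the multiprocessing script mentioned above, assuming Pillow is installed (an assumption; the notebook itself validates with tf.io.decode_image):
# Hypothetical standalone script: validate images in parallel.
# Pillow's verify() is a cheap integrity check; it may accept a few files
# that tf.io.decode_image would still reject.
import os
import pathlib
from concurrent.futures import ProcessPoolExecutor
from PIL import Image

def is_valid(path):
    try:
        with Image.open(path) as img:
            img.verify()
        return path, True
    except Exception:
        return path, False

if __name__ == '__main__':
    data_path = pathlib.Path('PetImages')  # adjust to the extracted location
    with ProcessPoolExecutor() as pool:
        for path, ok in pool.map(is_valid, sorted(data_path.glob('*/*.jpg')), chunksize=256):
            if not ok:
                print(f'Remove {path}')
                os.remove(path)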
data_augmentation = tf.keras.Sequential([
layers.RandomRotation(0.1),
# layers.RandomTranslation(0, 0.2),
layers.RandomZoom(0.2),
layers.RandomFlip('horizontal')
])
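To see what the pipeline does, one image can be pushed through it a few times (a sketch; training=True is required because the random layers are inactive at inference):
# Sketch: apply the augmentation pipeline repeatedly to one training image
image_path = next(data_path.glob('*/*.jpg'))
image = tf.io.decode_jpeg(tf.io.read_file(str(image_path)), channels=3)
image = tf.expand_dims(tf.image.resize(image, (150, 150)), 0)  # float32 batch of 1
plt.figure(figsize=(8, 8))
for i in range(9):
    augmented = data_augmentation(image, training=True)
    plt.subplot(3, 3, i + 1)
    plt.imshow(tf.cast(augmented[0], tf.uint8))
    plt.axis('off')
plt.show()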
Configure the dataset for performance#
train_dataset = tf.keras.utils.image_dataset_from_directory(
data_path,
label_mode='binary',
batch_size=128,
image_size=(150, 150),
seed=42,
validation_split=0.1,
subset='training'
).cache().prefetch(tf.data.AUTOTUNE)
validation_dataset = tf.keras.utils.image_dataset_from_directory(
data_path,
label_mode='binary',
batch_size=32,
image_size=(150, 150),
seed=42,
validation_split=0.1,
subset='validation'
).cache().prefetch(tf.data.AUTOTUNE)
Found 24991 files belonging to 2 classes.
Using 22492 files for training.
Found 24991 files belonging to 2 classes.
Using 2499 files for validation.
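A quick look at one batch confirms the shapes and the binary labels (a sketch; note the training and validation loaders use different batch sizes, 128 vs. 32):
# Sketch: inspect one cached training batch
for images, labels in train_dataset.take(1):
    print(images.shape)  # (128, 150, 150, 3)
    print(labels.shape)  # (128, 1) -- binary labels, 0 = Cat, 1 = Dog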
Create and train the model#
class MyCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        if logs.get('accuracy', 0) > 0.8 and logs.get('val_accuracy', 0) > 0.8:
            print('\nReached 80% accuracy and validation accuracy so cancelling training')
            self.model.stop_training = True
def create_model():
model = tf.keras.Sequential([
layers.Rescaling(1/255, input_shape=(150, 150, 3)),
data_augmentation,
layers.Conv2D(16, 3, activation='relu'),
layers.MaxPooling2D(),
layers.Conv2D(32, 3, activation='relu'),
layers.MaxPooling2D(),
layers.Conv2D(64, 3, activation='relu'),
layers.MaxPooling2D(),
layers.Flatten(),
layers.Dense(512, activation='relu'),
layers.Dense(1)])
model.compile(optimizer='rmsprop',
loss=losses.BinaryCrossentropy(from_logits=True),
metrics=['accuracy'])
return model
model = create_model()
callbacks = MyCallback()
history = model.fit(train_dataset,
epochs=15,
verbose=1,
validation_data=validation_dataset,
callbacks=[callbacks])
Epoch 1/15
176/176 [==============================] - 52s 275ms/step - loss: 0.7753 - accuracy: 0.5505 - val_loss: 0.6213 - val_accuracy: 0.5758
Epoch 2/15
176/176 [==============================] - 46s 259ms/step - loss: 0.6044 - accuracy: 0.6481 - val_loss: 0.5230 - val_accuracy: 0.7051
Epoch 3/15
176/176 [==============================] - 44s 252ms/step - loss: 0.5534 - accuracy: 0.6990 - val_loss: 0.6178 - val_accuracy: 0.6439
Epoch 4/15
176/176 [==============================] - 43s 246ms/step - loss: 0.5153 - accuracy: 0.7297 - val_loss: 0.4446 - val_accuracy: 0.7723
Epoch 5/15
176/176 [==============================] - 44s 248ms/step - loss: 0.4867 - accuracy: 0.7499 - val_loss: 0.4405 - val_accuracy: 0.7963
Epoch 6/15
176/176 [==============================] - 43s 247ms/step - loss: 0.4687 - accuracy: 0.7648 - val_loss: 0.4069 - val_accuracy: 0.8107
Epoch 7/15
176/176 [==============================] - 50s 283ms/step - loss: 0.4551 - accuracy: 0.7769 - val_loss: 0.4323 - val_accuracy: 0.8171
Epoch 8/15
176/176 [==============================] - 51s 287ms/step - loss: 0.4384 - accuracy: 0.7822 - val_loss: 0.3998 - val_accuracy: 0.8327
Epoch 9/15
176/176 [==============================] - 51s 289ms/step - loss: 0.4236 - accuracy: 0.7949 - val_loss: 0.4209 - val_accuracy: 0.8255
Epoch 10/15
176/176 [==============================] - ETA: 0s - loss: 0.4137 - accuracy: 0.8000
Reached 80% accuracy and validation accuracy so cancelling training
176/176 [==============================] - 51s 288ms/step - loss: 0.4137 - accuracy: 0.8000 - val_loss: 0.3593 - val_accuracy: 0.8339
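Because the final Dense(1) layer has no activation (hence from_logits=True in the loss), predictions come out as logits; a sigmoid converts them into probabilities. A minimal sketch:
# Sketch: turn logits into P(Dog) for one validation batch
for images, labels in validation_dataset.take(1):
    probabilities = tf.sigmoid(model(images, training=False))
    print(probabilities[:5].numpy().round(3))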
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(acc, label='Training Accuracy')
plt.plot(val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')
plt.subplot(1, 2, 2)
plt.plot(loss, label='Training loss')
plt.plot(val_loss, label='Validation loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()
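The validation curves track the training curves closely, and from epoch 4 onward validation accuracy even exceeds training accuracy, which is the expected sign that data augmentation is keeping the model from overfitting.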