C3W1: Explore the BBC News archive

BBC News Classification Dataset

  • 2225 examples of news articles with their respective categories (labels).

import tensorflow as tf
from tensorflow.keras import layers
import csv
import re 
import string
def parse_data_from_file(filename, encoding="utf-8"):
    """Read article texts and their category labels from a CSV file.

    The first row is treated as a header and discarded. In every following
    row, column 0 is the label and column 1 is the article text.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform locale.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists of
        strings, in file order. Both lists are empty when the file has no
        data rows.
    """
    sentences = []
    labels = []

    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are interpreted by the reader rather than by open().
    with open(filename, 'r', encoding=encoding, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        for row in reader:
            # Blank lines are returned as []; skip any row that does not
            # carry both a label and a text column.
            if len(row) < 2:
                continue
            labels.append(row[0])
            sentences.append(row[1])

    return sentences, labels
# Location of the BBC CSV file, given as a relative path.
dataset_path = '../../../data/bbc-text.csv'
# Load the article texts and their category names as two parallel lists.
sentences, labels = parse_data_from_file(dataset_path)
# English stopwords (lowercase, including common contractions) that are
# stripped from the text before it is vectorized.
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her",
    "here", "here's", "hers", "herself", "him", "himself", "his", "how",
    "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it",
    "it's", "its", "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some",
    "such",
    "than", "that", "that's", "the", "their", "theirs", "them",
    "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to",
    "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
    "what's", "when", "when's", "where", "where's", "which", "while",
    "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours",
    "yourself", "yourselves",
]
def custom_standardization(input_data):
    """Lowercase text, replace punctuation with spaces and drop stopwords.

    Passed as the ``standardize`` callable of ``layers.TextVectorization``.

    Args:
        input_data: String tensor (or anything ``tf.strings.lower`` accepts).

    Returns:
        A string tensor in which every ``string.punctuation`` character has
        been replaced by a space and every whole-word occurrence of an entry
        of the module-level ``stopwords`` list has been deleted.
    """
    text = tf.strings.lower(input_data)

    # Strip every punctuation mark except the apostrophe for now: stopwords
    # such as "it's" or "they've" contain one and could never match if it
    # had already been replaced.
    punctuation = string.punctuation.replace("'", "")
    text = tf.strings.regex_replace(text, '[%s]' % re.escape(punctuation), ' ')

    if stopwords:
        # A single alternation instead of one regex op per stopword. Longest
        # entries go first because the first alternative that matches wins,
        # and "he" would otherwise match at the start of "he'd".
        ordered = sorted(stopwords, key=len, reverse=True)
        pattern = r'\b(?:%s)\b' % '|'.join(re.escape(word) for word in ordered)
        text = tf.strings.regex_replace(text, pattern, '')

    # Apostrophes that were not part of a stopword are now treated like any
    # other punctuation mark.
    text = tf.strings.regex_replace(text, "'", ' ')

    return text
# Text-to-integer layer that cleans its input with custom_standardization.
vectorize_layer = layers.TextVectorization(standardize=custom_standardization)

# Fit the layer on the articles, then read back the vocabulary it built.
vectorize_layer.adapt(sentences)

vocab = vectorize_layer.get_vocabulary()
print(f'len(vocab): {len(vocab)} words')
len(vocab): 29609 words
# Apply the adapted layer to the whole corpus to display the articles as
# integer token ids.
vectorize_layer(sentences)
<tf.Tensor: shape=(2225, 2396), dtype=int64, numpy=
array([[   96,   176,  1156, ...,     0,     0,     0],
       [ 1602,   607,   251, ...,     0,     0,     0],
       [ 4964,  6926,  3900, ...,     0,     0,     0],
       ...,
       [ 5844,  2189,    10, ...,     0,     0,     0],
       [  384,  9851, 21295, ...,     0,     0,     0],
       [ 2460,  7908,   841, ...,     0,     0,     0]], dtype=int64)>
# Map each category name to an integer id. The names are sorted so the
# mapping is identical on every run; iterating a bare set of strings yields
# an order that can change between interpreter sessions.
classes = {label: i for i, label in enumerate(sorted(set(labels)))}
classes
{'sport': 0, 'tech': 1, 'entertainment': 2, 'politics': 3, 'business': 4}
# One single-element list per article, holding that article's class id,
# in the same order as `labels`.
label_sequences = [[classes[name]] for name in labels]
label_sequences[:10]
[[1], [4], [0], [0], [2], [3], [3], [0], [0], [2]]