tfdv_basic.ipynb

tfdv_basic.ipynb

Based on:

../data_validation/tfdv_basic.ipynb
commit d2a3417. If the upstream notebook has been updated since that commit, compare it against this copy.

import os
import tensorflow as tf
import tensorflow_data_validation as tfdv

# Fetch the Chicago taxi archive; extract=True asks get_file to unpack it.
# NOTE(review): the path returned with extract=True may differ between Keras
# versions (archive file vs. extraction directory) — confirm BASE_DIR resolves
# to the folder that contains 'data/'.
dataset_url = 'https://storage.googleapis.com/artifacts.tfx-oss-public.appspot.com/datasets'
dataset_path = tf.keras.utils.get_file(
    origin=f'{dataset_url}/chicago_data.zip',
    cache_subdir='chicago_data',
    extract=True,
)

# Every location used below is derived from the directory of the download.
BASE_DIR = os.path.dirname(dataset_path)
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'chicago_taxi_output')

# One CSV per split, all laid out as <DATA_DIR>/<split>/data.csv.
TRAIN_DATA, EVAL_DATA, SERVING_DATA = (
    os.path.join(DATA_DIR, split, 'data.csv')
    for split in ('train', 'eval', 'serving')
)

# Compute statistics over the training CSV and render them in the notebook.
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
tfdv.visualize_statistics(lhs_statistics=train_stats)

# Derive an initial schema from the training statistics and display it.
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)

	Type	Presence	Valency	Domain
Feature name
'pickup_community_area'	INT	required		-
'fare'	FLOAT	required		-
'trip_start_month'	INT	required		-
'trip_start_hour'	INT	required		-
'trip_start_day'	INT	required		-
'trip_start_timestamp'	INT	required		-
'pickup_latitude'	FLOAT	required		-
'pickup_longitude'	FLOAT	required		-
'dropoff_latitude'	FLOAT	optional	single	-
'dropoff_longitude'	FLOAT	optional	single	-
'trip_miles'	FLOAT	required		-
'pickup_census_tract'	BYTES	optional		-
'dropoff_census_tract'	INT	optional	single	-
'payment_type'	STRING	required		'payment_type'
'company'	STRING	optional	single	'company'
'trip_seconds'	INT	required		-
'dropoff_community_area'	INT	optional	single	-
'tips'	FLOAT	required		-

	Values
Domain
'payment_type'	'Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Unknown'
'company'	'0118 - 42111 Godfrey S.Awir', '0694 - 59280 Chinesco Trans Inc', '1085 - 72312 N and W Cab Co', '2733 - 74600 Benny Jona', '2809 - 95474 C & D Cab Co Inc.', '3011 - 66308 JBL Cab Inc.', '3152 - 97284 Crystal Abernathy', '3201 - C&D Cab Co Inc', '3201 - CID Cab Co Inc', '3253 - 91138 Gaither Cab Co.', '3385 - 23210 Eman Cab', '3623 - 72222 Arrington Enterprises', '3897 - Ilie Malec', '4053 - Adwar H. Nikola', '4197 - 41842 Royal Star', '4615 - 83503 Tyrone Henderson', '4615 - Tyrone Henderson', '4623 - Jay Kim', '5006 - 39261 Salifu Bawa', '5006 - Salifu Bawa', '5074 - 54002 Ahzmi Inc', '5074 - Ahzmi Inc', '5129 - 87128', '5129 - 98755 Mengisti Taxi', '5129 - Mengisti Taxi', '5724 - KYVI Cab Inc', '585 - Valley Cab Co', '5864 - 73614 Thomas Owusu', '5864 - Thomas Owusu', '5874 - 73628 Sergey Cab Corp.', '5997 - 65283 AW Services Inc.', '5997 - AW Services Inc.', '6488 - 83287 Zuha Taxi', '6743 - Luhak Corp', 'Blue Ribbon Taxi Association Inc.', 'C & D Cab Co Inc', 'Chicago Elite Cab Corp.', 'Chicago Elite Cab Corp. (Chicago Carriag', 'Chicago Medallion Leasing INC', 'Chicago Medallion Management', 'Choice Taxi Association', 'Dispatch Taxi Affiliation', 'KOAM Taxi Association', 'Northwest Management LLC', 'Taxi Affiliation Services', 'Top Cab Affiliation'

# Statistics for the evaluation split, plotted side by side with the
# training statistics (evaluation on the left, training on the right).
eval_stats = tfdv.generate_statistics_from_csv(data_location=EVAL_DATA)
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    rhs_statistics=train_stats,
    lhs_name='EVAL_DATASET',
    rhs_name='TRAIN_DATASET',
)

# Validate the evaluation statistics against the schema inferred from the
# training data and show whatever deviates from it.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(anomalies=anomalies)

	Anomaly short description	Anomaly long description
Feature name
'company'	Unexpected string values	Examples contain values missing from the schema: 2092 - 61288 Sbeih company (<1%), 2192 - 73487 Zeymane Corp (<1%), 2192 - Zeymane Corp (<1%), 2823 - 73307 Seung Lee (<1%), 3094 - 24059 G.L.B. Cab Co (<1%), 3319 - CD Cab Co (<1%), 3385 - Eman Cab (<1%), 3897 - 57856 Ilie Malec (<1%), 4053 - 40193 Adwar H. Nikola (<1%), 4197 - Royal Star (<1%), 585 - 88805 Valley Cab Co (<1%), 5874 - Sergey Cab Corp. (<1%), 6057 - 24657 Richard Addo (<1%), 6574 - Babylon Express Inc. (<1%), 6742 - 83735 Tasha ride inc (<1%).
'payment_type'	Unexpected string values	Examples contain values missing from the schema: Prcard (<1%).

# 'company': set the minimum domain mass constraint to 0.9 instead of
# enumerating each unseen company name reported above.
company = tfdv.get_feature(schema=schema, feature_path='company')
company.distribution_constraints.min_domain_mass = 0.9

# 'payment_type': add the one value reported as missing from its domain.
payment_type_domain = tfdv.get_domain(schema=schema, feature_path='payment_type')
payment_type_domain.value.append('Prcard')

# Re-run validation of the evaluation statistics against the edited schema.
updated_anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(anomalies=updated_anomalies)

No anomalies found.

# Compute statistics for the serving CSV and validate them against the schema.
serving_stats = tfdv.generate_statistics_from_csv(data_location=SERVING_DATA)
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats,
    schema=schema,
)

tfdv.display_anomalies(anomalies=serving_anomalies)

	Anomaly short description	Anomaly long description
Feature name
'tips'	Column dropped	Column is completely missing

# Regenerate the serving statistics, this time passing the schema through
# StatsOptions with infer_type_from_schema=True, then validate again.
options = tfdv.StatsOptions(
    schema=schema,
    infer_type_from_schema=True,
)
serving_stats = tfdv.generate_statistics_from_csv(
    data_location=SERVING_DATA,
    stats_options=options,
)
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats,
    schema=schema,
)

tfdv.display_anomalies(anomalies=serving_anomalies)

	Anomaly short description	Anomaly long description
Feature name
'tips'	Column dropped	Column is completely missing

# Register both environments as schema defaults.
schema.default_environment.extend(['TRAINING', 'SERVING'])

# Mark 'tips' as not part of the SERVING environment.
tfdv.get_feature(schema=schema, feature_path='tips').not_in_environment.append('SERVING')

# Validate the serving statistics in the SERVING environment.
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats,
    schema=schema,
    environment='SERVING',
)

tfdv.display_anomalies(anomalies=serving_anomalies)

No anomalies found.

# Skew comparator on 'payment_type': L-infinity threshold of 0.01.
payment_type = tfdv.get_feature(schema=schema, feature_path='payment_type')
payment_type.skew_comparator.infinity_norm.threshold = 0.01

# Drift comparator on 'company': L-infinity threshold of 0.001.
company = tfdv.get_feature(schema=schema, feature_path='company')
company.drift_comparator.infinity_norm.threshold = 0.001

# Validate the training statistics, supplying the evaluation statistics as
# "previous" and the serving statistics as "serving" for the comparators.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=eval_stats,
    serving_statistics=serving_stats,
)

tfdv.display_anomalies(anomalies=skew_anomalies)

	Anomaly short description	Anomaly long description
Feature name
'payment_type'	High Linfty distance between training and serving	The Linfty distance between training and serving is 0.0225 (up to six significant digits), above the threshold 0.01. The feature value with maximum difference is: Credit Card
'company'	High Linfty distance between current and previous	The Linfty distance between current and previous is 0.00820891 (up to six significant digits), above the threshold 0.001. The feature value with maximum difference is: Blue Ribbon Taxi Association Inc.

# Persist the curated schema as a text-format protobuf file.
# The output directory is created through the public tf.io.gfile API rather
# than the private tensorflow.python.lib.io.file_io module, which is not part
# of TensorFlow's supported API surface.
tf.io.gfile.makedirs(OUTPUT_DIR)
schema_file = os.path.join(OUTPUT_DIR, 'schema.pbtxt')
tfdv.write_schema_text(schema, schema_file)

# IPython shell escape: runs the external `code` command on the schema file
# written above (presumably opens it in VS Code — confirm `code` is on PATH).
!code {schema_file}