tfdv_basic.ipynb
Based on:
../data_validation/tfdv_basic.ipynb
import os
import tensorflow as tf
import tensorflow_data_validation as tfdv
# Fetch the Chicago taxi example archive with extract=True, using the
# 'chicago_data' cache sub-directory.
dataset_url = 'https://storage.googleapis.com/artifacts.tfx-oss-public.appspot.com/datasets'
dataset_path = tf.keras.utils.get_file(
    origin=f'{dataset_url}/chicago_data.zip',
    cache_subdir='chicago_data',
    extract=True,
)

# NOTE(review): every path below hangs off the parent directory of the path
# returned by get_file. This presumes the extracted `data/` tree sits next to
# that returned path -- confirm for the installed Keras version.
BASE_DIR = os.path.dirname(dataset_path)
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'chicago_taxi_output')

# One CSV per split, laid out as <DATA_DIR>/<split>/data.csv.
TRAIN_DATA, EVAL_DATA, SERVING_DATA = (
    os.path.join(DATA_DIR, split, 'data.csv')
    for split in ('train', 'eval', 'serving')
)
# Compute statistics over the training CSV and render them.
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
tfdv.visualize_statistics(lhs_statistics=train_stats)

# Infer a schema from the training statistics and display it.
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)
Feature name | Type | Presence | Valency | Domain |
---|---|---|---|---|
'pickup_community_area' | INT | required | | - |
'fare' | FLOAT | required | | - |
'trip_start_month' | INT | required | | - |
'trip_start_hour' | INT | required | | - |
'trip_start_day' | INT | required | | - |
'trip_start_timestamp' | INT | required | | - |
'pickup_latitude' | FLOAT | required | | - |
'pickup_longitude' | FLOAT | required | | - |
'dropoff_latitude' | FLOAT | optional | single | - |
'dropoff_longitude' | FLOAT | optional | single | - |
'trip_miles' | FLOAT | required | | - |
'pickup_census_tract' | BYTES | optional | | - |
'dropoff_census_tract' | INT | optional | single | - |
'payment_type' | STRING | required | | 'payment_type' |
'company' | STRING | optional | single | 'company' |
'trip_seconds' | INT | required | | - |
'dropoff_community_area' | INT | optional | single | - |
'tips' | FLOAT | required | | - |
Domain | Values |
---|---|
'payment_type' | 'Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Unknown' |
'company' | '0118 - 42111 Godfrey S.Awir', '0694 - 59280 Chinesco Trans Inc', '1085 - 72312 N and W Cab Co', '2733 - 74600 Benny Jona', '2809 - 95474 C & D Cab Co Inc.', '3011 - 66308 JBL Cab Inc.', '3152 - 97284 Crystal Abernathy', '3201 - C&D Cab Co Inc', '3201 - CID Cab Co Inc', '3253 - 91138 Gaither Cab Co.', '3385 - 23210 Eman Cab', '3623 - 72222 Arrington Enterprises', '3897 - Ilie Malec', '4053 - Adwar H. Nikola', '4197 - 41842 Royal Star', '4615 - 83503 Tyrone Henderson', '4615 - Tyrone Henderson', '4623 - Jay Kim', '5006 - 39261 Salifu Bawa', '5006 - Salifu Bawa', '5074 - 54002 Ahzmi Inc', '5074 - Ahzmi Inc', '5129 - 87128', '5129 - 98755 Mengisti Taxi', '5129 - Mengisti Taxi', '5724 - KYVI Cab Inc', '585 - Valley Cab Co', '5864 - 73614 Thomas Owusu', '5864 - Thomas Owusu', '5874 - 73628 Sergey Cab Corp.', '5997 - 65283 AW Services Inc.', '5997 - AW Services Inc.', '6488 - 83287 Zuha Taxi', '6743 - Luhak Corp', 'Blue Ribbon Taxi Association Inc.', 'C & D Cab Co Inc', 'Chicago Elite Cab Corp.', 'Chicago Elite Cab Corp. (Chicago Carriag', 'Chicago Medallion Leasing INC', 'Chicago Medallion Management', 'Choice Taxi Association', 'Dispatch Taxi Affiliation', 'KOAM Taxi Association', 'Northwest Management LLC', 'Taxi Affiliation Services', 'Top Cab Affiliation' |
# Statistics for the evaluation split, visualized side by side with the
# training statistics.
eval_stats = tfdv.generate_statistics_from_csv(data_location=EVAL_DATA)
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    rhs_statistics=train_stats,
    lhs_name='EVAL_DATASET',
    rhs_name='TRAIN_DATASET',
)

# Check the evaluation statistics against the schema inferred from training.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(anomalies=anomalies)
Feature name | Anomaly short description | Anomaly long description |
---|---|---|
'company' | Unexpected string values | Examples contain values missing from the schema: 2092 - 61288 Sbeih company (<1%), 2192 - 73487 Zeymane Corp (<1%), 2192 - Zeymane Corp (<1%), 2823 - 73307 Seung Lee (<1%), 3094 - 24059 G.L.B. Cab Co (<1%), 3319 - CD Cab Co (<1%), 3385 - Eman Cab (<1%), 3897 - 57856 Ilie Malec (<1%), 4053 - 40193 Adwar H. Nikola (<1%), 4197 - Royal Star (<1%), 585 - 88805 Valley Cab Co (<1%), 5874 - Sergey Cab Corp. (<1%), 6057 - 24657 Richard Addo (<1%), 6574 - Babylon Express Inc. (<1%), 6742 - 83735 Tasha ride inc (<1%). |
'payment_type' | Unexpected string values | Examples contain values missing from the schema: Prcard (<1%). |
# Relax the 'company' check by setting min_domain_mass to 0.9 on the feature's
# distribution constraints, instead of listing every unexpected company name.
company = tfdv.get_feature(schema, 'company')
company.distribution_constraints.min_domain_mass = 0.9
# Accept the 'Prcard' value reported for 'payment_type' by appending it to
# that feature's domain in the schema.
payment_type_domain = tfdv.get_domain(schema, 'payment_type')
payment_type_domain.value.append('Prcard')
# Re-validate the same evaluation statistics against the edited schema.
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)
No anomalies found.
# Validate the serving split against the schema, with no environment given.
serving_stats = tfdv.generate_statistics_from_csv(data_location=SERVING_DATA)
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats,
    schema=schema,
)
tfdv.display_anomalies(anomalies=serving_anomalies)
Feature name | Anomaly short description | Anomaly long description |
---|---|---|
'tips' | Column dropped | Column is completely missing |
# Regenerate the serving statistics with the schema supplied through
# StatsOptions and infer_type_from_schema enabled, then validate again.
options = tfdv.StatsOptions(
    schema=schema,
    infer_type_from_schema=True,
)
serving_stats = tfdv.generate_statistics_from_csv(
    data_location=SERVING_DATA,
    stats_options=options,
)
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats,
    schema=schema,
)
tfdv.display_anomalies(anomalies=serving_anomalies)
Feature name | Anomaly short description | Anomaly long description |
---|---|---|
'tips' | Column dropped | Column is completely missing |
# Register both environments as schema-wide defaults.
schema.default_environment.extend(['TRAINING', 'SERVING'])

# Mark 'tips' as not present in the SERVING environment.
tfdv.get_feature(schema, 'tips').not_in_environment.append('SERVING')

# Validate the serving statistics again, this time in the SERVING environment.
serving_anomalies = tfdv.validate_statistics(
    statistics=serving_stats,
    schema=schema,
    environment='SERVING',
)
tfdv.display_anomalies(anomalies=serving_anomalies)
No anomalies found.
# Skew comparator on 'payment_type': L-infinity threshold of 0.01.
payment_type = tfdv.get_feature(schema, 'payment_type')
payment_type.skew_comparator.infinity_norm.threshold = 0.01

# Drift comparator on 'company': L-infinity threshold of 0.001.
company = tfdv.get_feature(schema, 'company')
company.drift_comparator.infinity_norm.threshold = 0.001

# Validate the training statistics, passing the eval statistics as the
# previous statistics and the serving statistics as the serving statistics.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=eval_stats,
    serving_statistics=serving_stats,
)
tfdv.display_anomalies(anomalies=skew_anomalies)
Feature name | Anomaly short description | Anomaly long description |
---|---|---|
'payment_type' | High Linfty distance between training and serving | The Linfty distance between training and serving is 0.0225 (up to six significant digits), above the threshold 0.01. The feature value with maximum difference is: Credit Card |
'company' | High Linfty distance between current and previous | The Linfty distance between current and previous is 0.00820891 (up to six significant digits), above the threshold 0.001. The feature value with maximum difference is: Blue Ribbon Taxi Association Inc. |
# Create the output directory (including missing parents) through TensorFlow's
# public filesystem API rather than the private `tensorflow.python.lib.io`
# module path, which is not part of the supported API surface.
tf.io.gfile.makedirs(OUTPUT_DIR)

# Write the curated schema as text to <OUTPUT_DIR>/schema.pbtxt.
schema_file = os.path.join(OUTPUT_DIR, 'schema.pbtxt')
tfdv.write_schema_text(schema, schema_file)
# IPython shell escape: runs the `code` command on the schema file written
# above (presumably the VS Code CLI -- confirm). Not valid in plain Python.
!code {schema_file}