tfdv_basic_spending.ipynb

Based on:

import pandas as pd
import tensorflow_data_validation as tfdv
def get_csv(csv_name):
    """Download one CSV from the training-data-analyst repo as a DataFrame.

    Args:
        csv_name: File name (e.g. ``'score_train.csv'``) inside the
            ``production_ml/solutions/data`` directory of the repository.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` for that URL.
    """
    base_url = 'https://raw.githubusercontent.com/GoogleCloudPlatform/training-data-analyst'
    data_dir = 'master/courses/machine_learning/deepdive2/production_ml/solutions/data'
    # Join the three parts with single slashes to form the raw-file URL.
    csv_url = '/'.join((base_url, data_dir, csv_name))
    return pd.read_csv(csv_url)
# Fetch score_train.csv into a DataFrame and preview its first rows.
score_train = get_csv('score_train.csv')
score_train.head()
Graduated Profession Work_Experience Family_Size Spending_Score
0 No Healthcare 1.0 4.0 Low
1 Yes Engineer NaN 3.0 Average
2 Yes Engineer 1.0 1.0 Low
3 Yes Lawyer 0.0 2.0 High
4 Yes Entertainment NaN 6.0 High
# Fetch score_test.csv into a DataFrame and preview its first rows.
score_test = get_csv('score_test.csv')
score_test.head()
Graduated Profession Work_Experience Family_Size Spending_Score
0 No Doctor 0.0 5.0 Average
1 Yes Entertainment 1.0 4.0 Average
2 No Lawyer 0.0 5.0 Low
3 Yes Executive 1.0 5.0 High
4 Yes Artist 1.0 2.0 Average
# Print per-column non-null counts and dtypes to see which columns have gaps.
score_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Graduated        3964 non-null   object 
 1   Profession       3944 non-null   object 
 2   Work_Experience  3589 non-null   float64
 3   Family_Size      3831 non-null   float64
 4   Spending_Score   4000 non-null   object 
dtypes: float64(2), object(3)
memory usage: 156.4+ KB
# List every public (non-underscore) name exported by the tfdv package.
[name for name in dir(tfdv) if not name.startswith('_')]
['CombinerStatsGenerator',
 'CrossFeatureView',
 'DatasetListView',
 'DatasetView',
 'DetectFeatureSkew',
 'FeaturePath',
 'FeatureView',
 'GenerateStatistics',
 'MergeDatasetFeatureStatisticsList',
 'StatsOptions',
 'TransformStatsGenerator',
 'WriteStatisticsToBinaryFile',
 'WriteStatisticsToRecordsAndBinaryFile',
 'WriteStatisticsToTFRecord',
 'anomalies',
 'api',
 'arrow',
 'coders',
 'compare_slices',
 'constants',
 'default_sharded_output_suffix',
 'default_sharded_output_supported',
 'display_anomalies',
 'display_schema',
 'experimental_get_feature_value_slicer',
 'generate_dummy_schema_with_paths',
 'generate_statistics_from_csv',
 'generate_statistics_from_dataframe',
 'generate_statistics_from_tfrecord',
 'get_confusion_count_dataframes',
 'get_domain',
 'get_feature',
 'get_feature_stats',
 'get_match_stats_dataframe',
 'get_skew_result_dataframe',
 'get_slice_stats',
 'get_statistics_html',
 'infer_schema',
 'load_anomalies_text',
 'load_schema_text',
 'load_sharded_statistics',
 'load_statistics',
 'load_stats_binary',
 'load_stats_text',
 'pywrap',
 'set_domain',
 'skew',
 'statistics',
 'types',
 'update_schema',
 'utils',
 'validate_corresponding_slices',
 'validate_examples_in_csv',
 'validate_examples_in_tfrecord',
 'validate_statistics',
 'version',
 'visualize_statistics',
 'write_anomalies_text',
 'write_schema_text',
 'write_stats_text']
# Compute statistics over the score_train DataFrame and render them.
stats = tfdv.generate_statistics_from_dataframe(score_train)
tfdv.visualize_statistics(stats)
# Compute statistics for both DataFrames (train first, then test) ...
train_stats, test_stats = (
    tfdv.generate_statistics_from_dataframe(frame)
    for frame in (score_train, score_test)
)

# ... and render them in one view, labelled so the two sides can be told apart.
tfdv.visualize_statistics(
    train_stats,
    test_stats,
    lhs_name='TRAIN_DATASET',
    rhs_name='NEW_DATASET',
)
# Infer a schema from the score_train statistics held in `stats`, then display it.
schema = tfdv.infer_schema(stats)
tfdv.display_schema(schema)
Type Presence Valency Domain
Feature name
'Graduated' STRING optional single 'Graduated'
'Profession' STRING optional single 'Profession'
'Work_Experience' FLOAT optional single -
'Family_Size' FLOAT optional single -
'Spending_Score' STRING required 'Spending_Score'
Values
Domain
'Graduated' 'No', 'Yes'
'Profession' 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing'
'Spending_Score' 'Average', 'High', 'Low'
# Require these features in every example: a presence min_fraction of 1.0
# is shown as "required" in the schema displayed below.
for required_feature in ('Graduated', 'Profession', 'Family_Size'):
    tfdv.get_feature(schema, required_feature).presence.min_fraction = 1.0
tfdv.display_schema(schema)
Type Presence Valency Domain
Feature name
'Graduated' STRING required single 'Graduated'
'Profession' STRING required single 'Profession'
'Work_Experience' FLOAT optional single -
'Family_Size' FLOAT required single -
'Spending_Score' STRING required 'Spending_Score'
Values
Domain
'Graduated' 'No', 'Yes'
'Profession' 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing'
'Spending_Score' 'Average', 'High', 'Low'
# Extend the 'Profession' string domain with a new allowed value. The domain
# object belongs to `schema`, so the change is visible in later
# display_schema(schema) calls.
# NOTE(review): the variable name is misspelled ('Profesion'); it is kept
# unchanged in case other cells refer to it.
Profesion_domain = tfdv.get_domain(schema, 'Profession')
# Guard the insert so that re-running this cell does not add a duplicate
# 'Self-Employed' entry to the domain.
if 'Self-Employed' not in Profesion_domain.value:
    Profesion_domain.value.insert(0, 'Self-Employed')
Profesion_domain.value
['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing']
# Drop 'Homemaker' from the allowed values of the 'Profession' string domain.
# NOTE(review): the variable name is misspelled ('Profesion'); it is kept
# unchanged in case other cells refer to it.
Profesion_domain = tfdv.get_domain(schema, 'Profession')
# Only remove the value while it is still present, so re-running this cell
# after the first removal is a no-op instead of failing on a missing value.
if 'Homemaker' in Profesion_domain.value:
    Profesion_domain.value.remove('Homemaker')
Profesion_domain.value
['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Lawyer', 'Marketing']
# Change the declared type of 'Family_Size' from FLOAT to INT. The literal 2
# is the numeric value of the INT feature type, as the schema displayed by
# the next call confirms.
# NOTE(review): the named enum constant would read better than a bare 2 —
# confirm which module exposes it before switching.
tfdv.get_feature(schema, 'Family_Size').type = 2
tfdv.display_schema(schema)
Type Presence Valency Domain
Feature name
'Graduated' STRING required single 'Graduated'
'Profession' STRING required single 'Profession'
'Work_Experience' FLOAT optional single -
'Family_Size' INT required single -
'Spending_Score' STRING required 'Spending_Score'
Values
Domain
'Graduated' 'No', 'Yes'
'Profession' 'Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Lawyer', 'Marketing'
'Spending_Score' 'Average', 'High', 'Low'