tfdv_basic_spending.ipynb
tfdv_basic_spending.ipynb#
Based on:
../production_ml/solutions/tfdv_basic_spending.ipynb
import pandas as pd
import tensorflow_data_validation as tfdv
def get_csv(csv_name):
    """Read one CSV from the course's GitHub data folder into a DataFrame.

    Args:
        csv_name: File name inside the solutions ``data`` directory,
            e.g. ``'score_train.csv'``.

    Returns:
        Whatever ``pd.read_csv`` returns for the assembled raw-GitHub URL.
    """
    url_segments = (
        'https://raw.githubusercontent.com/GoogleCloudPlatform/training-data-analyst',
        'master/courses/machine_learning/deepdive2/production_ml/solutions/data',
        f'{csv_name}',
    )
    csv_url = '/'.join(url_segments)
    return pd.read_csv(csv_url)
# Fetch the training CSV through get_csv and preview its first rows.
score_train = get_csv('score_train.csv')
score_train.head()
| | Graduated | Profession | Work_Experience | Family_Size | Spending_Score |
|---|---|---|---|---|---|
| 0 | No | Healthcare | 1.0 | 4.0 | Low | 
| 1 | Yes | Engineer | NaN | 3.0 | Average | 
| 2 | Yes | Engineer | 1.0 | 1.0 | Low | 
| 3 | Yes | Lawyer | 0.0 | 2.0 | High | 
| 4 | Yes | Entertainment | NaN | 6.0 | High | 
# Fetch the test CSV through get_csv and preview its first rows.
score_test = get_csv('score_test.csv')
score_test.head()
| | Graduated | Profession | Work_Experience | Family_Size | Spending_Score |
|---|---|---|---|---|---|
| 0 | No | Doctor | 0.0 | 5.0 | Average | 
| 1 | Yes | Entertainment | 1.0 | 4.0 | Average | 
| 2 | No | Lawyer | 0.0 | 5.0 | Low | 
| 3 | Yes | Executive | 1.0 | 5.0 | High | 
| 4 | Yes | Artist | 1.0 | 2.0 | Average | 
# Print each column's dtype and non-null count for the training frame.
score_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Graduated        3964 non-null   object 
 1   Profession       3944 non-null   object 
 2   Work_Experience  3589 non-null   float64
 3   Family_Size      3831 non-null   float64
 4   Spending_Score   4000 non-null   object 
dtypes: float64(2), object(3)
memory usage: 156.4+ KB
# List every public (non-underscore-prefixed) attribute name of the tfdv package.
[attr_name for attr_name in dir(tfdv) if not attr_name.startswith('_')]
['CombinerStatsGenerator',
 'CrossFeatureView',
 'DatasetListView',
 'DatasetView',
 'DetectFeatureSkew',
 'FeaturePath',
 'FeatureView',
 'GenerateStatistics',
 'MergeDatasetFeatureStatisticsList',
 'StatsOptions',
 'TransformStatsGenerator',
 'WriteStatisticsToBinaryFile',
 'WriteStatisticsToRecordsAndBinaryFile',
 'WriteStatisticsToTFRecord',
 'anomalies',
 'api',
 'arrow',
 'coders',
 'compare_slices',
 'constants',
 'default_sharded_output_suffix',
 'default_sharded_output_supported',
 'display_anomalies',
 'display_schema',
 'experimental_get_feature_value_slicer',
 'generate_dummy_schema_with_paths',
 'generate_statistics_from_csv',
 'generate_statistics_from_dataframe',
 'generate_statistics_from_tfrecord',
 'get_confusion_count_dataframes',
 'get_domain',
 'get_feature',
 'get_feature_stats',
 'get_match_stats_dataframe',
 'get_skew_result_dataframe',
 'get_slice_stats',
 'get_statistics_html',
 'infer_schema',
 'load_anomalies_text',
 'load_schema_text',
 'load_sharded_statistics',
 'load_statistics',
 'load_stats_binary',
 'load_stats_text',
 'pywrap',
 'set_domain',
 'skew',
 'statistics',
 'types',
 'update_schema',
 'utils',
 'validate_corresponding_slices',
 'validate_examples_in_csv',
 'validate_examples_in_tfrecord',
 'validate_statistics',
 'version',
 'visualize_statistics',
 'write_anomalies_text',
 'write_schema_text',
 'write_stats_text']
# Compute the training-set statistics once and reuse them below; previously the
# identical statistics were generated twice under two different names.
train_stats = tfdv.generate_statistics_from_dataframe(score_train)
# `stats` stays available as an alias for any cell that still refers to it.
stats = train_stats
tfdv.visualize_statistics(stats)

# Statistics for the test frame, shown side by side with the training set.
test_stats = tfdv.generate_statistics_from_dataframe(score_test)
tfdv.visualize_statistics(train_stats, test_stats,
                          lhs_name='TRAIN_DATASET', rhs_name='NEW_DATASET')

# Infer the schema from the training statistics and render it.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)
| Feature name | Type | Presence | Valency | Domain |
|---|---|---|---|---|
| 'Graduated' | STRING | optional | single | 'Graduated' |
| 'Profession' | STRING | optional | single | 'Profession' |
| 'Work_Experience' | FLOAT | optional | single | - |
| 'Family_Size' | FLOAT | optional | single | - |
| 'Spending_Score' | STRING | required | | 'Spending_Score' |
| Domain | Values |
|---|---|
| 'Graduated' | 'No', 'Yes' | 
| 'Profession' | 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing' | 
| 'Spending_Score' | 'Average', 'High', 'Low' | 
# Require these features to be present in every example
# (presence.min_fraction = 1.0), then re-render the schema.
for required_feature in ('Graduated', 'Profession', 'Family_Size'):
    tfdv.get_feature(schema, required_feature).presence.min_fraction = 1.0
tfdv.display_schema(schema)
| Feature name | Type | Presence | Valency | Domain |
|---|---|---|---|---|
| 'Graduated' | STRING | required | single | 'Graduated' |
| 'Profession' | STRING | required | single | 'Profession' |
| 'Work_Experience' | FLOAT | optional | single | - |
| 'Family_Size' | FLOAT | required | single | - |
| 'Spending_Score' | STRING | required | | 'Spending_Score' |
| Domain | Values |
|---|---|
| 'Graduated' | 'No', 'Yes' | 
| 'Profession' | 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing' | 
| 'Spending_Score' | 'Average', 'High', 'Low' | 
# Extend the 'Profession' string domain: put 'Self-Employed' at the front of
# its accepted values, then show the resulting value list.
# NOTE(review): the variable name is misspelled ('Profesion'); left unchanged
# in case other cells refer to it.
Profesion_domain = tfdv.get_domain(schema, 'Profession')
Profesion_domain.value.insert(0, 'Self-Employed')
Profesion_domain.value
['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing']
# Drop 'Homemaker' from the 'Profession' domain's accepted values, then show
# the resulting value list.
Profesion_domain = tfdv.get_domain(schema, 'Profession')
Profesion_domain.value.remove('Homemaker')
Profesion_domain.value
['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Lawyer', 'Marketing']
# Change Family_Size's feature type via its raw enum number: 2 corresponds to
# INT (display_schema reports the feature as INT after this assignment).
tfdv.get_feature(schema, 'Family_Size').type = 2
tfdv.display_schema(schema)
| Feature name | Type | Presence | Valency | Domain |
|---|---|---|---|---|
| 'Graduated' | STRING | required | single | 'Graduated' |
| 'Profession' | STRING | required | single | 'Profession' |
| 'Work_Experience' | FLOAT | optional | single | - |
| 'Family_Size' | INT | required | single | - |
| 'Spending_Score' | STRING | required | | 'Spending_Score' |
| Domain | Values |
|---|---|
| 'Graduated' | 'No', 'Yes' | 
| 'Profession' | 'Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Lawyer', 'Marketing' | 
| 'Spending_Score' | 'Average', 'High', 'Low' |