tfdv_basic_spending.ipynb
tfdv_basic_spending.ipynb#
Based on:
../production_ml/solutions/tfdv_basic_spending.ipynb
import pandas as pd
import tensorflow_data_validation as tfdv
def get_csv(csv_name):
    """Download one of the lab's CSV files from GitHub as a DataFrame.

    Args:
        csv_name: File name of the CSV inside the lab's ``data`` directory,
            e.g. ``'score_train.csv'``.

    Returns:
        The ``pandas.DataFrame`` parsed from the remote CSV.
    """
    data_dir = 'master/courses/machine_learning/deepdive2/production_ml/solutions/data'
    base_url = 'https://raw.githubusercontent.com/GoogleCloudPlatform/training-data-analyst'
    csv_url = f'{base_url}/{data_dir}/{csv_name}'
    return pd.read_csv(csv_url)
# Load score_train.csv into a DataFrame and preview its first rows.
score_train = get_csv('score_train.csv')
score_train.head()
Graduated | Profession | Work_Experience | Family_Size | Spending_Score | |
---|---|---|---|---|---|
0 | No | Healthcare | 1.0 | 4.0 | Low |
1 | Yes | Engineer | NaN | 3.0 | Average |
2 | Yes | Engineer | 1.0 | 1.0 | Low |
3 | Yes | Lawyer | 0.0 | 2.0 | High |
4 | Yes | Entertainment | NaN | 6.0 | High |
# Load score_test.csv into a DataFrame and preview its first rows.
score_test = get_csv('score_test.csv')
score_test.head()
Graduated | Profession | Work_Experience | Family_Size | Spending_Score | |
---|---|---|---|---|---|
0 | No | Doctor | 0.0 | 5.0 | Average |
1 | Yes | Entertainment | 1.0 | 4.0 | Average |
2 | No | Lawyer | 0.0 | 5.0 | Low |
3 | Yes | Executive | 1.0 | 5.0 | High |
4 | Yes | Artist | 1.0 | 2.0 | Average |
# Summarise column dtypes and non-null counts to spot columns with missing values.
score_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Graduated 3964 non-null object
1 Profession 3944 non-null object
2 Work_Experience 3589 non-null float64
3 Family_Size 3831 non-null float64
4 Spending_Score 4000 non-null object
dtypes: float64(2), object(3)
memory usage: 156.4+ KB
# List every public (non-underscore) name exported by the tfdv package.
[name for name in dir(tfdv) if not name.startswith('_')]
['CombinerStatsGenerator',
'CrossFeatureView',
'DatasetListView',
'DatasetView',
'DetectFeatureSkew',
'FeaturePath',
'FeatureView',
'GenerateStatistics',
'MergeDatasetFeatureStatisticsList',
'StatsOptions',
'TransformStatsGenerator',
'WriteStatisticsToBinaryFile',
'WriteStatisticsToRecordsAndBinaryFile',
'WriteStatisticsToTFRecord',
'anomalies',
'api',
'arrow',
'coders',
'compare_slices',
'constants',
'default_sharded_output_suffix',
'default_sharded_output_supported',
'display_anomalies',
'display_schema',
'experimental_get_feature_value_slicer',
'generate_dummy_schema_with_paths',
'generate_statistics_from_csv',
'generate_statistics_from_dataframe',
'generate_statistics_from_tfrecord',
'get_confusion_count_dataframes',
'get_domain',
'get_feature',
'get_feature_stats',
'get_match_stats_dataframe',
'get_skew_result_dataframe',
'get_slice_stats',
'get_statistics_html',
'infer_schema',
'load_anomalies_text',
'load_schema_text',
'load_sharded_statistics',
'load_statistics',
'load_stats_binary',
'load_stats_text',
'pywrap',
'set_domain',
'skew',
'statistics',
'types',
'update_schema',
'utils',
'validate_corresponding_slices',
'validate_examples_in_csv',
'validate_examples_in_tfrecord',
'validate_statistics',
'version',
'visualize_statistics',
'write_anomalies_text',
'write_schema_text',
'write_stats_text']
# Compute descriptive statistics for the training DataFrame and render them.
stats = tfdv.generate_statistics_from_dataframe(score_train)
tfdv.visualize_statistics(stats)
# `stats` already holds the statistics for score_train, so reuse it rather
# than making a second, identical pass over the training DataFrame.
train_stats = stats
test_stats = tfdv.generate_statistics_from_dataframe(score_test)
# Overlay both datasets in a single view to compare their distributions.
tfdv.visualize_statistics(
    train_stats,
    test_stats,
    lhs_name='TRAIN_DATASET',
    rhs_name='NEW_DATASET',
)
# Infer an initial schema from the training statistics and display it.
schema = tfdv.infer_schema(stats)
tfdv.display_schema(schema)
Type | Presence | Valency | Domain | |
---|---|---|---|---|
Feature name | ||||
'Graduated' | STRING | optional | single | 'Graduated' |
'Profession' | STRING | optional | single | 'Profession' |
'Work_Experience' | FLOAT | optional | single | - |
'Family_Size' | FLOAT | optional | single | - |
'Spending_Score' | STRING | required | | 'Spending_Score' |
Values | |
---|---|
Domain | |
'Graduated' | 'No', 'Yes' |
'Profession' | 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing' |
'Spending_Score' | 'Average', 'High', 'Low' |
# Require these features to be present in 100% of examples, then show the
# updated schema.
for feature_name in ('Graduated', 'Profession', 'Family_Size'):
    tfdv.get_feature(schema, feature_name).presence.min_fraction = 1.0
tfdv.display_schema(schema)
Type | Presence | Valency | Domain | |
---|---|---|---|---|
Feature name | ||||
'Graduated' | STRING | required | single | 'Graduated' |
'Profession' | STRING | required | single | 'Profession' |
'Work_Experience' | FLOAT | optional | single | - |
'Family_Size' | FLOAT | required | single | - |
'Spending_Score' | STRING | required | | 'Spending_Score' |
Values | |
---|---|
Domain | |
'Graduated' | 'No', 'Yes' |
'Profession' | 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing' |
'Spending_Score' | 'Average', 'High', 'Low' |
# Fetch the 'Profession' string domain from the schema and add a new allowed
# value at the front of its value list (this edits the schema in place).
Profesion_domain = tfdv.get_domain(schema, 'Profession')
Profesion_domain.value.insert(0, 'Self-Employed')
# Show the updated list of allowed values.
Profesion_domain.value
['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Homemaker', 'Lawyer', 'Marketing']
# Fetch the 'Profession' string domain again and drop 'Homemaker' from its
# allowed values (this edits the schema in place).
Profesion_domain = tfdv.get_domain(schema, 'Profession')
Profesion_domain.value.remove('Homemaker')
# Show the updated list of allowed values.
Profesion_domain.value
['Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Lawyer', 'Marketing']
# Switch Family_Size from FLOAT to INT; 2 is the numeric feature-type value
# that the schema display renders as INT.
tfdv.get_feature(schema, 'Family_Size').type = 2
tfdv.display_schema(schema)
Type | Presence | Valency | Domain | |
---|---|---|---|---|
Feature name | ||||
'Graduated' | STRING | required | single | 'Graduated' |
'Profession' | STRING | required | single | 'Profession' |
'Work_Experience' | FLOAT | optional | single | - |
'Family_Size' | INT | required | single | - |
'Spending_Score' | STRING | required | | 'Spending_Score' |
Values | |
---|---|
Domain | |
'Graduated' | 'No', 'Yes' |
'Profession' | 'Self-Employed', 'Artist', 'Doctor', 'Engineer', 'Entertainment', 'Executive', 'Healthcare', 'Lawyer', 'Marketing' |
'Spending_Score' | 'Average', 'High', 'Low' |