CluedIn Data Quality Metrics in Microsoft Fabric and Power BI
youtu.be/hRo965y_f9w?si=hsc0uT18XXJt6AZO
You can find the code used in the video further below.
CLUEDIN_USERNAME='...'
CLUEDIN_PASSWORD='...'
CLUEDIN_API_TOKEN='...'
CLUEDIN_ORG_NAME='...'
CLUEDIN_DOMAIN='...'
%pip install cluedin
import cluedin
ctx = cluedin.Context.from_dict({
'domain': CLUEDIN_DOMAIN,
'org_name': CLUEDIN_ORG_NAME,
'access_token': CLUEDIN_API_TOKEN
})
import pandas as pd
query = """
query searchEntities($cursor: PagingCursor, $query: String, $pageSize: Int) {
search(
query: $query
cursor: $cursor
pageSize: $pageSize
sort: FIELDS
sortFields: {field: "id", direction: ASCENDING}
) {
totalResults
cursor
entries {
id
name
entityType
properties
}
}
}
"""
variables = {
'query': 'entityType:/IMDb/Name',
'pageSize': 10_000
}
df = pd.DataFrame(
cluedin.gql.entries(
context=ctx,
query=query,
variables=variables,
flat=True))
df
df.set_index('id', inplace=True)
df.head(5)
df.shape
df = df.rename(columns=lambda x: x.replace('imdb.name.', ''))
df.head(5)
has_birth_year_records_count = (df['birthYear'] != 'N/A').sum()
total_records_count = len(df)
completeness_of_birth_year = has_birth_year_records_count / total_records_count
print(f"Completeness of birthYear: {round(completeness_of_birth_year * 100, 2)}%")
spark \
.createDataFrame(df) \
.write.mode('overwrite') \
.format('delta') \
.saveAsTable('imdb_names')
display(spark.sql("SELECT * FROM mdm_lakehouse.imdb_names WHERE LIMIT 10"))
Birth Year Completeness =
DIVIDE(
COUNTROWS(
FILTER(
imdb_names,
imdb_names[birthYear] <> "N/A"
)
),
COUNTROWS(imdb_names)
)