CluedIn Data Quality Metrics in Microsoft Fabric and Power BI

youtu.be/hRo965y_f9w?si=hsc0uT18XXJt6AZO

You can find the code used in the video further below.

CLUEDIN_USERNAME='...'
CLUEDIN_PASSWORD='...'
CLUEDIN_API_TOKEN='...'
CLUEDIN_ORG_NAME='...'
CLUEDIN_DOMAIN='...'

%pip install cluedin

import cluedin

ctx = cluedin.Context.from_dict({
  'domain': CLUEDIN_DOMAIN,
  'org_name': CLUEDIN_ORG_NAME,
  'access_token': CLUEDIN_API_TOKEN
})

import pandas as pd

query = """
query searchEntities($cursor: PagingCursor, $query: String, $pageSize: Int) {
  search(
    query: $query
    cursor: $cursor
    pageSize: $pageSize
    sort: FIELDS
    sortFields: {field: "id", direction: ASCENDING}
  ) {
    totalResults
    cursor
    entries {
      id
      name
      entityType
      properties
    }
  }
}
"""

variables = {
    'query': 'entityType:/IMDb/Name',
    'pageSize': 10_000
}

df = pd.DataFrame(
    cluedin.gql.entries(
        context=ctx,
        query=query,
        variables=variables,
        flat=True))

df

df.set_index('id', inplace=True)
df.head(5)

df.shape

df = df.rename(columns=lambda x: x.replace('imdb.name.', ''))
df.head(5)

has_birth_year_records_count = (df['birthYear'] != 'N/A').sum()
total_records_count = len(df)

completeness_of_birth_year = has_birth_year_records_count / total_records_count

print(f"Completeness of birthYear: {round(completeness_of_birth_year * 100, 2)}%")

spark \
    .createDataFrame(df) \
    .write.mode('overwrite') \
    .format('delta') \
    .saveAsTable('imdb_names')

display(spark.sql("SELECT * FROM mdm_lakehouse.imdb_names WHERE LIMIT 10"))

Birth Year Completeness = 
DIVIDE(
    COUNTROWS(
        FILTER(
            imdb_names, 
            imdb_names[birthYear] <> "N/A"
        )
    ), 
    COUNTROWS(imdb_names)
)