# /// script # requires-python = ">=3.11" # dependencies = [ # "polars>=0.20.0", # "pyarrow>=18.0.0", # "sqlalchemy>=2.0.0", # "pyodbc>=5.0.0", # "clickhouse-connect>=0.7.0", # "clickhouse-sqlalchemy>=0.3.2", # "pyyaml>=6.0.3", # "python-dotenv>=1.0.0", # ] # /// from __future__ import annotations import os # import pyarrow import sys from datetime import date, timedelta, datetime import polars as pl import yaml from dotenv import load_dotenv from sqlalchemy import create_engine, text from sqlalchemy.engine import Engine, URL import clickhouse_connect from log import log from clickhouse_task.create_table import * from clickhouse_task.delete_task import * from clickhouse_task.load_table import * from db_con.connection import * from mids import * from masters.dimensions import * from masters.bridge import * from kpi.facts import * def main(): log.info("=" * 80) log.info("Hello from data-move Python data pipeline !") if len(sys.argv) > 1: run_date = datetime.strptime(sys.argv[1], "%Y-%m-%d").date() else: run_date = date.today() - timedelta(days=1) log.info(f"Data-pipeline running Date is -:{run_date}") # connecting with both db servers sql-server log.info("connecting with both db servers sql-serveras well as clickhouse DB") sql_engine = build_sql_server_engine() clickhouse_engine = build_clickhouse_engine() client=get_clickhouse_client() log.info("Both databases connected successfully") mids = MID_TABLE_COV(sql_engine, run_date) emp_visit_df = MID_TABLE_COV1( sql_engine, run_date ) delete_existing_data( client=client, run_date=run_date, mids=mids, emp_visit_df=emp_visit_df, ) mid_list = ",".join(map(str, mids)) conditions = { "mids": f"MID IN ({mid_list})", "j_plan": ( f"MONTH(VisitDate) = {run_date.month} " f"AND YEAR(VisitDate) = {run_date.year}" ), "mapping": ( f"CAST(Z.FromDate AS DATE) <= '{run_date}' " f"AND CAST(Z.ToDate AS DATE) >= '{run_date}'" ), "web": ( f"CAST(login_date AS DATE) = '{run_date}'" ), "none": None, } # fetching polar df from sql-server with open("tables.yml", "r") as file: config = yaml.safe_load(file) for table in config["tables"]: table_name=table["name"] table_type=table["type"] operation=table["operation"] condition=table["condition"] c = conditions.get(condition) log.info("=" * 80) log.info("TABLE=%s | TYPE=%s | OPERATION=%s", table_name, table_type, operation) fn=f"fetch_{table_name}" fetch_by = table["fetch_by"] if operation == "DELETE+INSERT" : if fetch_by == "mids": df = globals()[fn](sql_engine, mids) check_query=f"EXISTS TABLE {table_name}" exists = client.command(check_query) if exists == 0 : create_clickhouse_table(df, table_name, clickhouse_engine) else: truncate_table(client , table_name ) log.info(f"Truncate a ClickHouse table - {table_name}") load_to_clickhouse(client=client,table_name=table_name,df=df) elif fetch_by == "run_date": df = globals()[fn](sql_engine, run_date) check_query=f"EXISTS TABLE {table_name}" exists = client.command(check_query) if exists == 0 : create_clickhouse_table(df, table_name, clickhouse_engine) else: truncate_table(client , table_name ) log.info(f"Truncate a ClickHouse table - {table_name}") load_to_clickhouse(client=client,table_name=table_name,df=df) else: df = globals()[fn](sql_engine) check_query=f"EXISTS TABLE {table_name}" exists = client.command(check_query) if exists == 0 : create_clickhouse_table(df, table_name, clickhouse_engine) else: truncate_table(client , table_name ) log.info(f"Truncate a ClickHouse table - {table_name}") load_to_clickhouse(client=client,table_name=table_name,df=df) else: if fetch_by == "mids": df = globals()[fn](sql_engine, mids) check_query=f"EXISTS TABLE {table_name}" exists = client.command(check_query) if exists == 0 : create_clickhouse_table(df, table_name, clickhouse_engine) else: delete_rows(client, table_name, c ) load_to_clickhouse(client=client,table_name=table_name,df=df) elif fetch_by == "run_date": df = globals()[fn](sql_engine, run_date) check_query=f"EXISTS TABLE {table_name}" exists = client.command(check_query) if exists == 0 : create_clickhouse_table(df, table_name, clickhouse_engine) else: delete_rows(client, table_name, c) load_to_clickhouse(client=client,table_name=table_name,df=df) else: df = globals()[fn](sql_engine) check_query=f"EXISTS TABLE {table_name}" exists = client.command(check_query) if exists == 0 : create_clickhouse_table(df, table_name, clickhouse_engine) else: truncate_table(client , table_name ) log.info(f"Truncate a ClickHouse table - {table_name}") load_to_clickhouse(client=client,table_name=table_name,df=df) if __name__ == "__main__": main()