Files
data_pipeline/main.py
T
2026-06-15 12:44:13 +05:30

251 lines
6.2 KiB
Python

# /// script
# requires-python = ">=3.11"
# dependencies = [
# "polars>=0.20.0",
# "pyarrow>=18.0.0",
# "sqlalchemy>=2.0.0",
# "pyodbc>=5.0.0",
# "clickhouse-connect>=0.7.0",
# "clickhouse-sqlalchemy>=0.3.2",
# "pyyaml>=6.0.3",
# "python-dotenv>=1.0.0",
# ]
# ///
from __future__ import annotations
import os
# import pyarrow
import sys
from datetime import date, timedelta, datetime
import polars as pl
import yaml
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine, URL
import clickhouse_connect
from log import log
from clickhouse_task.create_table import *
from clickhouse_task.delete_task import *
from clickhouse_task.load_table import *
from db_con.connection import *
from mids import *
from masters.dimensions import *
from masters.bridge import *
from kpi.facts import *
def _resolve_run_date(argv: list[str]) -> date:
    """Return the run date given on the command line, or yesterday if absent.

    Args:
        argv: The process argument vector; ``argv[1]``, when present, must be
            a date formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If ``argv[1]`` is present but is not a valid date.
    """
    if len(argv) > 1:
        try:
            return datetime.strptime(argv[1], "%Y-%m-%d").date()
        except ValueError as err:
            raise ValueError(
                f"Invalid run date {argv[1]!r}; expected format YYYY-MM-DD"
            ) from err
    return date.today() - timedelta(days=1)


def _build_conditions(run_date: date, mids) -> dict[str, str | None]:
    """Build the named WHERE-clause fragments that tables.yml refers to.

    The keys are the values allowed in a table's ``condition`` field; the
    fragment is handed to ``delete_rows`` for incremental tables.
    """
    # NOTE(review): an empty ``mids`` yields "MID IN ()" -- confirm that the
    # delete helper / ClickHouse tolerate it, or that mids is never empty.
    mid_list = ",".join(map(str, mids))
    return {
        "mids": f"MID IN ({mid_list})",
        "j_plan": (
            f"MONTH(VisitDate) = {run_date.month} "
            f"AND YEAR(VisitDate) = {run_date.year}"
        ),
        "mapping": (
            f"CAST(Z.FromDate AS DATE) <= '{run_date}' "
            f"AND CAST(Z.ToDate AS DATE) >= '{run_date}'"
        ),
        "web": (
            f"CAST(login_date AS DATE) = '{run_date}'"
        ),
        "none": None,
    }


def _lookup_fetch_function(table_name: str):
    """Return the module-level ``fetch_<table_name>`` function.

    The fetch functions arrive in this module's namespace through the
    star-imports at the top of the file, hence the ``globals()`` lookup.

    Raises:
        KeyError: If no such name exists, with a message naming the table.
    """
    fn_name = f"fetch_{table_name}"
    try:
        return globals()[fn_name]
    except KeyError:
        raise KeyError(
            f"tables.yml lists table {table_name!r} but no function "
            f"{fn_name}() is available in main.py"
        ) from None


def _fetch_source_frame(fetch_fn, fetch_by, sql_engine, mids, run_date):
    """Call ``fetch_fn`` with the extra argument selected by ``fetch_by``."""
    if fetch_by == "mids":
        return fetch_fn(sql_engine, mids)
    if fetch_by == "run_date":
        return fetch_fn(sql_engine, run_date)
    # Any other value means the fetch function needs only the engine.
    return fetch_fn(sql_engine)


def _refresh_table(client, clickhouse_engine, table_name, df, *,
                   full_reload, delete_condition):
    """Create or clear the ClickHouse table, then load ``df`` into it.

    Args:
        client: ClickHouse client used for EXISTS, clearing and loading.
        clickhouse_engine: Engine passed to ``create_clickhouse_table``.
        table_name: Target ClickHouse table.
        df: Frame fetched from SQL Server for this table.
        full_reload: When true an existing table is truncated; otherwise only
            the rows matching ``delete_condition`` are deleted.
        delete_condition: WHERE fragment (or None) passed to ``delete_rows``.
    """
    exists = client.command(f"EXISTS TABLE {table_name}")
    if exists == 0:
        create_clickhouse_table(df, table_name, clickhouse_engine)
    elif full_reload:
        truncate_table(client, table_name)
        log.info("Truncate a ClickHouse table - %s", table_name)
    else:
        delete_rows(client, table_name, delete_condition)
    # NOTE(review): the load is assumed to run for newly created tables as
    # well as for cleared ones -- confirm create_clickhouse_table does not
    # already insert the frame.
    load_to_clickhouse(client=client, table_name=table_name, df=df)


def main() -> None:
    """Copy the tables listed in tables.yml from SQL Server into ClickHouse.

    The run date is taken from ``sys.argv[1]`` (``YYYY-MM-DD``) and defaults
    to yesterday. For every entry under ``tables`` in ``tables.yml`` the
    matching ``fetch_<name>`` function is called, the ClickHouse table is
    created if missing or cleared if present, and the fetched frame is loaded.

    Raises:
        ValueError: If the command-line date is malformed.
        KeyError: If a table entry lacks a required key or has no matching
            ``fetch_<name>`` function.
    """
    log.info("=" * 80)
    log.info("Hello from data-move Python data pipeline !")
    run_date = _resolve_run_date(sys.argv)
    log.info("Data-pipeline running Date is -:%s", run_date)

    # Connect to both database servers: SQL Server (source) and ClickHouse (target).
    log.info("connecting with both db servers sql-serveras well as clickhouse DB")
    sql_engine = build_sql_server_engine()
    clickhouse_engine = build_clickhouse_engine()
    client = get_clickhouse_client()
    log.info("Both databases connected successfully")

    mids = MID_TABLE_COV(sql_engine, run_date)
    emp_visit_df = MID_TABLE_COV1(sql_engine, run_date)
    # Project helper; presumably clears previously loaded data for this run
    # before the reload -- confirm against clickhouse_task.delete_task.
    delete_existing_data(
        client=client,
        run_date=run_date,
        mids=mids,
        emp_visit_df=emp_visit_df,
    )

    conditions = _build_conditions(run_date, mids)

    # The path is relative, so the script must be started from the directory
    # that contains tables.yml. The encoding is explicit so parsing does not
    # depend on the platform default.
    with open("tables.yml", "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    for table in config["tables"]:
        table_name = table["name"]
        table_type = table["type"]
        operation = table["operation"]
        condition = table["condition"]
        delete_condition = conditions.get(condition)
        if condition is not None and condition not in conditions:
            # A misspelled key would otherwise silently become None.
            log.warning(
                "Unknown condition %r for table %s; passing None to the delete step",
                condition,
                table_name,
            )

        log.info("=" * 80)
        log.info("TABLE=%s | TYPE=%s | OPERATION=%s",
                 table_name,
                 table_type,
                 operation)

        fetch_by = table["fetch_by"]
        fetch_fn = _lookup_fetch_function(table_name)
        df = _fetch_source_frame(fetch_fn, fetch_by, sql_engine, mids, run_date)

        # DELETE+INSERT tables are always fully reloaded. Other operations
        # delete by condition only when the fetch is scoped by mids or
        # run_date; unscoped fetches are truncated as well.
        full_reload = (
            operation == "DELETE+INSERT"
            or fetch_by not in ("mids", "run_date")
        )
        _refresh_table(
            client,
            clickhouse_engine,
            table_name,
            df,
            full_reload=full_reload,
            delete_condition=delete_condition,
        )
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()