251 lines
6.2 KiB
Python
251 lines
6.2 KiB
Python
# /// script
|
|
# requires-python = ">=3.11"
|
|
# dependencies = [
|
|
# "polars>=0.20.0",
|
|
# "pyarrow>=18.0.0",
|
|
# "sqlalchemy>=2.0.0",
|
|
# "pyodbc>=5.0.0",
|
|
# "clickhouse-connect>=0.7.0",
|
|
# "clickhouse-sqlalchemy>=0.3.2",
|
|
# "pyyaml>=6.0.3",
|
|
# "python-dotenv>=1.0.0",
|
|
# ]
|
|
# ///
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
# import pyarrow
|
|
import sys
|
|
from datetime import date, timedelta, datetime
|
|
|
|
import polars as pl
|
|
import yaml
|
|
from dotenv import load_dotenv
|
|
|
|
from sqlalchemy import create_engine, text
|
|
from sqlalchemy.engine import Engine, URL
|
|
|
|
import clickhouse_connect
|
|
|
|
from log import log
|
|
|
|
|
|
|
|
from clickhouse_task.create_table import *
|
|
from clickhouse_task.delete_task import *
|
|
from clickhouse_task.load_table import *
|
|
from db_con.connection import *
|
|
from mids import *
|
|
from masters.dimensions import *
|
|
from masters.bridge import *
|
|
from kpi.facts import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _fetch_source_frame(table_name: str, fetch_by: str, sql_engine, mids, run_date: date):
    """Call the module-level ``fetch_<table_name>`` function and return its result.

    The extra positional argument passed after ``sql_engine`` depends on
    ``fetch_by``: ``"mids"`` passes ``mids``, ``"run_date"`` passes
    ``run_date``, and any other value passes nothing.

    Raises:
        KeyError: if no ``fetch_<table_name>`` name exists in this module's
            globals (the fetchers arrive through the star imports).
    """
    fetch_name = f"fetch_{table_name}"
    try:
        fetch = globals()[fetch_name]
    except KeyError:
        # Same exception type as the bare lookup, but the message names the
        # table entry in tables.yml that has no matching fetch function.
        raise KeyError(
            f"no fetch function {fetch_name!r} is in scope for table {table_name!r}"
        ) from None

    if fetch_by == "mids":
        return fetch(sql_engine, mids)
    if fetch_by == "run_date":
        return fetch(sql_engine, run_date)
    return fetch(sql_engine)


def _sync_table(client, clickhouse_engine, table_name: str, df, *, truncate: bool, delete_condition):
    """Create or clear one ClickHouse table, then load ``df`` into it.

    If ``EXISTS TABLE`` reports 0 the table is created from ``df``.
    Otherwise the table is emptied first: with ``truncate_table`` when
    ``truncate`` is true, else with ``delete_rows`` and ``delete_condition``.
    """
    exists = client.command(f"EXISTS TABLE {table_name}")
    if exists == 0:
        create_clickhouse_table(df, table_name, clickhouse_engine)
    elif truncate:
        truncate_table(client, table_name)
        log.info(f"Truncate a ClickHouse table - {table_name}")
    else:
        delete_rows(client, table_name, delete_condition)

    # NOTE(review): the load is assumed to run after both the "create" and the
    # "clear" path; confirm that create_clickhouse_table does not itself insert rows.
    load_to_clickhouse(client=client, table_name=table_name, df=df)


def main() -> None:
    """Run the data-move pipeline for one run date.

    The run date is taken from the first command-line argument in
    ``YYYY-MM-DD`` form; without an argument it is yesterday. The function
    connects to both databases, calls ``delete_existing_data`` for the run
    date, and then processes every entry under ``tables`` in ``tables.yml``
    (read relative to the current working directory): fetch a frame via
    ``fetch_<name>``, create or clear the ClickHouse table, and load the frame.

    Raises:
        ValueError: if the command-line date does not match ``%Y-%m-%d``.
        KeyError: if a table entry lacks a required key or has no matching
            ``fetch_<name>`` function.
    """
    log.info("=" * 80)
    log.info("Hello from data-move Python data pipeline !")

    if len(sys.argv) > 1:
        run_date = datetime.strptime(sys.argv[1], "%Y-%m-%d").date()
    else:
        run_date = date.today() - timedelta(days=1)

    log.info(f"Data-pipeline running Date is -:{run_date}")

    # Open connections to SQL Server and ClickHouse (engine plus native client).
    log.info("connecting with both db servers sql-serveras well as clickhouse DB")
    sql_engine = build_sql_server_engine()
    clickhouse_engine = build_clickhouse_engine()
    client = get_clickhouse_client()
    log.info("Both databases connected successfully")

    mids = MID_TABLE_COV(sql_engine, run_date)
    emp_visit_df = MID_TABLE_COV1(sql_engine, run_date)

    delete_existing_data(
        client=client,
        run_date=run_date,
        mids=mids,
        emp_visit_df=emp_visit_df,
    )

    # Delete predicates selectable per table through the "condition" key in
    # tables.yml; they are handed to delete_rows as-is.
    mid_list = ",".join(map(str, mids))
    conditions = {
        "mids": f"MID IN ({mid_list})",
        "j_plan": (
            f"MONTH(VisitDate) = {run_date.month} "
            f"AND YEAR(VisitDate) = {run_date.year}"
        ),
        "mapping": (
            f"CAST(Z.FromDate AS DATE) <= '{run_date}' "
            f"AND CAST(Z.ToDate AS DATE) >= '{run_date}'"
        ),
        "web": (
            f"CAST(login_date AS DATE) = '{run_date}'"
        ),
        "none": None,
    }

    with open("tables.yml", "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    for table in config["tables"]:
        table_name = table["name"]
        table_type = table["type"]
        operation = table["operation"]
        condition = table["condition"]

        # Unknown condition names resolve to None, exactly like "none".
        delete_condition = conditions.get(condition)

        log.info("=" * 80)
        log.info(
            "TABLE=%s | TYPE=%s | OPERATION=%s",
            table_name,
            table_type,
            operation,
        )

        fetch_by = table["fetch_by"]

        # Fetch the source frame from SQL Server before touching ClickHouse.
        df = _fetch_source_frame(table_name, fetch_by, sql_engine, mids, run_date)

        # An existing table is truncated for every "DELETE+INSERT" entry and for
        # entries fetched without a mids/run_date argument; all other entries
        # are cleared with their configured delete condition.
        truncate = operation == "DELETE+INSERT" or fetch_by not in ("mids", "run_date")

        _sync_table(
            client,
            clickhouse_engine,
            table_name,
            df,
            truncate=truncate,
            delete_condition=delete_condition,
        )
|
|
|
|
|
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()