14-06-2026 2nd commit
This commit is contained in:
@@ -13,238 +13,262 @@
|
||||
# ///
|
||||
|
||||
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
# import pyarrow
|
||||
import sys
|
||||
from datetime import date, timedelta, datetime
|
||||
from datetime import date, datetime, timedelta
|
||||
|
||||
import polars as pl
|
||||
import yaml
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from sqlalchemy import create_engine, text
|
||||
from sqlalchemy.engine import Engine, URL
|
||||
|
||||
import clickhouse_connect
|
||||
|
||||
from log import log
|
||||
|
||||
from clickhouse_task.create_table import create_clickhouse_table
|
||||
from clickhouse_task.delete_task import (
|
||||
delete_existing_data,
|
||||
truncate_table,
|
||||
)
|
||||
|
||||
from clickhouse_task.load_table import load_to_clickhouse
|
||||
|
||||
from db_con.connection import (
|
||||
build_sql_server_engine,
|
||||
build_clickhouse_engine,
|
||||
get_clickhouse_client,
|
||||
)
|
||||
|
||||
from mids import (
|
||||
MID_TABLE_COV,
|
||||
MID_TABLE_COV1,
|
||||
)
|
||||
|
||||
from clickhouse_task.create_table import *
|
||||
from clickhouse_task.delete_task import *
|
||||
from clickhouse_task.load_table import *
|
||||
from db_con.connection import *
|
||||
from mids import *
|
||||
from masters.dimensions import *
|
||||
from masters.bridge import *
|
||||
from kpi.facts import *
|
||||
|
||||
|
||||
# ==========================================================
|
||||
# Helpers
|
||||
# ==========================================================
|
||||
|
||||
def table_exists(
    client,
    table_name: str,
) -> bool:
    """Return True when ClickHouse reports that ``table_name`` exists.

    Runs ``EXISTS TABLE <table_name>`` through ``client.command`` and
    converts the reply to a boolean.

    Args:
        client: Object exposing ``command(query)``; in this script it is the
            value returned by ``get_clickhouse_client()``.
        table_name: Table name interpolated verbatim into the statement.
            NOTE(review): no quoting/escaping is applied, so the name is
            presumably trusted configuration -- confirm against the caller.

    Returns:
        True if the reply is truthy / equals ``1``, otherwise False.
    """
    reply = client.command(f"EXISTS TABLE {table_name}")

    # A textual reply such as "0" is a non-empty string and therefore truthy;
    # compare its content instead of relying on bool() for strings.
    if isinstance(reply, str):
        return reply.strip() == "1"

    return bool(reply)
|
||||
|
||||
|
||||
def get_dataframe(
    fn_name: str,
    fetch_by: str,
    sql_engine,
    mids,
    run_date,
):
    """Look up a fetch function by name and call it with the right arguments.

    The function is resolved from this module's globals, so it must be
    defined or imported here under exactly ``fn_name``.

    Args:
        fn_name: Name of the fetch function to resolve in module globals.
        fetch_by: Selects the call signature:
            ``"mids"`` calls ``fn(sql_engine, mids)``,
            ``"run_date"`` calls ``fn(sql_engine, run_date)``,
            any other value calls ``fn(sql_engine)``.
        sql_engine: Passed as the first positional argument to the function.
        mids: Second argument when ``fetch_by == "mids"``.
        run_date: Second argument when ``fetch_by == "run_date"``.

    Returns:
        Whatever the resolved fetch function returns (presumably a Polars
        DataFrame, given how the result is used elsewhere -- TODO confirm).

    Raises:
        KeyError: No global named ``fn_name`` exists.
        TypeError: The global named ``fn_name`` is not callable.
    """
    try:
        fn = globals()[fn_name]
    except KeyError:
        # Same exception type as the bare lookup, but the message tells the
        # operator what is missing rather than just echoing the key.
        raise KeyError(
            f"No fetch function named {fn_name!r} is defined or imported "
            "in this module"
        ) from None

    if not callable(fn):
        raise TypeError(
            f"Global {fn_name!r} is not callable and cannot be used as a "
            "fetch function"
        )

    if fetch_by == "mids":
        return fn(sql_engine, mids)

    if fetch_by == "run_date":
        return fn(sql_engine, run_date)

    # Any other fetch_by value means the function takes only the engine.
    return fn(sql_engine)
|
||||
|
||||
|
||||
# ==========================================================
|
||||
# Main
|
||||
# ==========================================================
|
||||
|
||||
def main():
|
||||
|
||||
log.info("=" * 80)
|
||||
log.info("Hello from data-move Python data pipeline !")
|
||||
log.info("Hello from data-move Python data pipeline!")
|
||||
|
||||
# ------------------------------------------------------
|
||||
# Run Date
|
||||
# ------------------------------------------------------
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
run_date = datetime.strptime(sys.argv[1], "%Y-%m-%d").date()
|
||||
run_date = datetime.strptime(
|
||||
sys.argv[1],
|
||||
"%Y-%m-%d",
|
||||
).date()
|
||||
else:
|
||||
run_date = date.today() - timedelta(days=1)
|
||||
|
||||
log.info(f"Data-pipeline running Date is -:{run_date}")
|
||||
# connecting with both db servers sql-server
|
||||
log.info(
|
||||
"Pipeline Run Date: %s",
|
||||
run_date,
|
||||
)
|
||||
|
||||
# ------------------------------------------------------
|
||||
# Connections
|
||||
# ------------------------------------------------------
|
||||
|
||||
log.info(
|
||||
"Connecting to databases..."
|
||||
)
|
||||
|
||||
|
||||
log.info("connecting with both db servers sql-serveras well as clickhouse DB")
|
||||
|
||||
sql_engine = build_sql_server_engine()
|
||||
clickhouse_engine = build_clickhouse_engine()
|
||||
client=get_clickhouse_client()
|
||||
client = get_clickhouse_client()
|
||||
|
||||
log.info("Both databases connected successfully")
|
||||
log.info(
|
||||
"Database connections established"
|
||||
)
|
||||
|
||||
mids = MID_TABLE_COV(sql_engine, run_date)
|
||||
# ------------------------------------------------------
|
||||
# Delete Keys
|
||||
# ------------------------------------------------------
|
||||
|
||||
mids = MID_TABLE_COV(
|
||||
sql_engine,
|
||||
run_date,
|
||||
)
|
||||
|
||||
emp_visit_df = MID_TABLE_COV1(
|
||||
sql_engine,
|
||||
run_date
|
||||
run_date,
|
||||
)
|
||||
|
||||
delete_existing_data(
|
||||
client=client,
|
||||
run_date=run_date,
|
||||
mids=mids,
|
||||
emp_visit_df=emp_visit_df,
|
||||
)
|
||||
# ------------------------------------------------------
|
||||
# Config
|
||||
# ------------------------------------------------------
|
||||
|
||||
with open(
|
||||
"t.yml",
|
||||
"r",
|
||||
) as file:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
mid_list = ",".join(map(str, mids))
|
||||
conditions = {
|
||||
"mids": f"MID IN ({mid_list})",
|
||||
|
||||
"j_plan": (
|
||||
f"MONTH(VisitDate) = {run_date.month} "
|
||||
f"AND YEAR(VisitDate) = {run_date.year}"
|
||||
),
|
||||
|
||||
"mapping": (
|
||||
f"CAST(Z.FromDate AS DATE) <= '{run_date}' "
|
||||
f"AND CAST(Z.ToDate AS DATE) >= '{run_date}'"
|
||||
),
|
||||
|
||||
"web": (
|
||||
f"CAST(login_date AS DATE) = '{run_date}'"
|
||||
),
|
||||
|
||||
"none": None,
|
||||
}
|
||||
|
||||
|
||||
|
||||
# fetching polar df from sql-server
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
with open("tables.yml", "r") as file:
|
||||
config = yaml.safe_load(file)
|
||||
|
||||
for table in config["tables"]:
|
||||
|
||||
table_name=table["name"]
|
||||
table_type=table["type"]
|
||||
operation=table["operation"]
|
||||
condition=table["condition"]
|
||||
|
||||
c = conditions.get(condition)
|
||||
|
||||
log.info("=" * 80)
|
||||
log.info("TABLE=%s | TYPE=%s | OPERATION=%s",
|
||||
table_name,
|
||||
table_type,
|
||||
operation)
|
||||
|
||||
# ------------------------------------------------------
|
||||
# Process Tables
|
||||
# ------------------------------------------------------
|
||||
|
||||
fn=f"fetch_{table_name}"
|
||||
for table in config["tables"]:
|
||||
|
||||
table_name = table["name"]
|
||||
operation = table["operation"]
|
||||
fetch_by = table["fetch_by"]
|
||||
|
||||
if operation == "DELETE+INSERT" :
|
||||
if fetch_by == "mids":
|
||||
df = globals()[fn](sql_engine, mids)
|
||||
check_query=f"EXISTS TABLE {table_name}"
|
||||
exists = client.command(check_query)
|
||||
if exists == 0 :
|
||||
create_clickhouse_table(df, table_name, clickhouse_engine)
|
||||
log.info("=" * 80)
|
||||
log.info(
|
||||
"Processing Table: %s",
|
||||
table_name,
|
||||
)
|
||||
|
||||
else:
|
||||
truncate_table(client , table_name )
|
||||
log.info(f"Truncate a ClickHouse table - {table_name}")
|
||||
try:
|
||||
|
||||
load_to_clickhouse(client=client,table_name=table_name,df=df)
|
||||
# ------------------------------------------
|
||||
# Fetch Data
|
||||
# ------------------------------------------
|
||||
|
||||
elif fetch_by == "run_date":
|
||||
df = globals()[fn](sql_engine, run_date)
|
||||
check_query=f"EXISTS TABLE {table_name}"
|
||||
exists = client.command(check_query)
|
||||
if exists == 0 :
|
||||
create_clickhouse_table(df, table_name, clickhouse_engine)
|
||||
fn_name = f"fetch_{table_name}"
|
||||
|
||||
else:
|
||||
truncate_table(client , table_name )
|
||||
log.info(f"Truncate a ClickHouse table - {table_name}")
|
||||
df = get_dataframe(
|
||||
fn_name=fn_name,
|
||||
fetch_by=fetch_by,
|
||||
sql_engine=sql_engine,
|
||||
mids=mids,
|
||||
run_date=run_date,
|
||||
)
|
||||
|
||||
if df.is_empty():
|
||||
|
||||
log.warning(
|
||||
"%s returned no rows",
|
||||
table_name,
|
||||
)
|
||||
|
||||
continue
|
||||
|
||||
log.info(
|
||||
"Fetched %s rows",
|
||||
len(df),
|
||||
)
|
||||
|
||||
# ------------------------------------------
|
||||
# Create Table If Missing
|
||||
# ------------------------------------------
|
||||
|
||||
exists = table_exists(
|
||||
client,
|
||||
table_name,
|
||||
)
|
||||
|
||||
if not exists:
|
||||
|
||||
log.info(
|
||||
"Creating table %s",
|
||||
table_name,
|
||||
)
|
||||
|
||||
create_clickhouse_table(
|
||||
df=df,
|
||||
table_name=table_name,
|
||||
clickhouse_engine=clickhouse_engine,
|
||||
)
|
||||
|
||||
# ------------------------------------------
|
||||
# Existing Table Logic
|
||||
# ------------------------------------------
|
||||
|
||||
else:
|
||||
|
||||
if operation == "DELETE+INSERT":
|
||||
|
||||
truncate_table(
|
||||
client,
|
||||
table_name,
|
||||
)
|
||||
|
||||
load_to_clickhouse(client=client,table_name=table_name,df=df)
|
||||
else:
|
||||
df = globals()[fn](sql_engine)
|
||||
check_query=f"EXISTS TABLE {table_name}"
|
||||
exists = client.command(check_query)
|
||||
if exists == 0 :
|
||||
create_clickhouse_table(df, table_name, clickhouse_engine)
|
||||
|
||||
else:
|
||||
truncate_table(client , table_name )
|
||||
log.info(f"Truncate a ClickHouse table - {table_name}")
|
||||
delete_existing_data(
|
||||
client=client,
|
||||
table_name=table_name,
|
||||
run_date=run_date,
|
||||
mids=mids,
|
||||
emp_visit_df=emp_visit_df,
|
||||
)
|
||||
|
||||
load_to_clickhouse(client=client,table_name=table_name,df=df)
|
||||
else:
|
||||
if fetch_by == "mids":
|
||||
df = globals()[fn](sql_engine, mids)
|
||||
check_query=f"EXISTS TABLE {table_name}"
|
||||
exists = client.command(check_query)
|
||||
if exists == 0 :
|
||||
create_clickhouse_table(df, table_name, clickhouse_engine)
|
||||
# ------------------------------------------
|
||||
# Load Data
|
||||
# ------------------------------------------
|
||||
|
||||
else:
|
||||
load_to_clickhouse(
|
||||
client=client,
|
||||
table_name=table_name,
|
||||
df=df,
|
||||
)
|
||||
|
||||
delete_rows(client, table_name, c )
|
||||
log.info(
|
||||
"%s loaded successfully (%s rows)",
|
||||
table_name,
|
||||
len(df),
|
||||
)
|
||||
|
||||
load_to_clickhouse(client=client,table_name=table_name,df=df)
|
||||
|
||||
elif fetch_by == "run_date":
|
||||
df = globals()[fn](sql_engine, run_date)
|
||||
check_query=f"EXISTS TABLE {table_name}"
|
||||
exists = client.command(check_query)
|
||||
if exists == 0 :
|
||||
create_clickhouse_table(df, table_name, clickhouse_engine)
|
||||
|
||||
else:
|
||||
|
||||
delete_rows(client, table_name, c)
|
||||
|
||||
load_to_clickhouse(client=client,table_name=table_name,df=df)
|
||||
else:
|
||||
df = globals()[fn](sql_engine)
|
||||
check_query=f"EXISTS TABLE {table_name}"
|
||||
exists = client.command(check_query)
|
||||
if exists == 0 :
|
||||
create_clickhouse_table(df, table_name, clickhouse_engine)
|
||||
|
||||
else:
|
||||
truncate_table(client , table_name )
|
||||
log.info(f"Truncate a ClickHouse table - {table_name}")
|
||||
|
||||
load_to_clickhouse(client=client,table_name=table_name,df=df)
|
||||
except Exception:
|
||||
|
||||
log.exception(
|
||||
"Failed processing table %s",
|
||||
table_name,
|
||||
)
|
||||
|
||||
raise
|
||||
|
||||
log.info("=" * 80)
|
||||
log.info("Pipeline Completed Successfully")
|
||||
log.info("=" * 80)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user