final commit
This commit is contained in:
@@ -25,3 +25,7 @@ RUN uv sync --frozen
|
||||
COPY . .
|
||||
|
||||
ENV PATH="/opt/airflow/project/.venv/bin:${PATH}"
|
||||
|
||||
|
||||
# Airflow DAG folder
|
||||
ENV AIRFLOW__CORE__DAGS_FOLDER=/opt/airflow/dags
|
||||
@@ -2,4 +2,3 @@ pipeline:
|
||||
error_message: null
|
||||
run_date: null
|
||||
status: null
|
||||
last_successful_run_date: '2026-06-22'
|
||||
|
||||
+1
-1
@@ -9,4 +9,4 @@ CH_HOST=172.188.12.194
|
||||
CH_PORT=8123
|
||||
CH_USER=default
|
||||
CH_PASS=dipanshu_k
|
||||
CH_DB=kelloggs_1
|
||||
CH_DB=kelloggs_z
|
||||
@@ -0,0 +1,5 @@
|
||||
pipeline:
|
||||
start_date: '2026-06-01'
|
||||
end_date: '2026-06-20'
|
||||
flag: N
|
||||
Note: 'Set flag to Y (yes) to run the pipeline for the start_date..end_date range above; set it to N (no) to ignore these dates.'
|
||||
@@ -0,0 +1,3 @@
|
||||
- pipeline_trigeered_on_date: '2026-06-23'
|
||||
failed_run_date: none
|
||||
attempt: none
|
||||
File diff suppressed because it is too large
Load Diff
@@ -52,6 +52,45 @@ from src.dim import *
|
||||
# Helpers
|
||||
# ==========================================================
|
||||
|
||||
def get_dates_from_yaml(filename: str):
    """Read the custom date range and its enable flag from a YAML file.

    The file must contain a ``pipeline`` mapping with the keys
    ``start_date``, ``end_date`` and ``flag``.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        A ``(start_date, end_date, flag)`` tuple. Both dates are
        ``datetime.date`` objects. ``flag`` is a stripped, upper-cased
        string, so callers can compare it against ``"Y"`` / ``"N"``.

    Raises:
        KeyError: If the ``pipeline`` section or one of its keys is missing.
        ValueError: If a date value is not a valid ISO ``YYYY-MM-DD`` date.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filename, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)

    # An empty file loads as None; report that as a missing section rather
    # than failing with an opaque TypeError on subscripting.
    pipeline = data.get("pipeline") if isinstance(data, dict) else None
    if not isinstance(pipeline, dict):
        raise KeyError(f"'pipeline' section missing in {filename}")

    def _as_date(key: str) -> date:
        value = pipeline[key]
        # Unquoted YAML dates/timestamps arrive already parsed as date or
        # datetime objects (datetime is a date subclass); rebuild a plain
        # date instead of round-tripping through str().
        if isinstance(value, date):
            return date(value.year, value.month, value.day)
        return date.fromisoformat(str(value).strip())

    start_date = _as_date("start_date")
    end_date = _as_date("end_date")

    raw_flag = pipeline["flag"]
    if isinstance(raw_flag, bool):
        # Unquoted yes/no/true/false are loaded as booleans by the YAML
        # parser; map them onto the Y/N convention the caller checks.
        flag = "Y" if raw_flag else "N"
    else:
        # Tolerate lower case and stray whitespace ("y", " Y ").
        flag = str(raw_flag).strip().upper()

    return start_date, end_date, flag
|
||||
|
||||
|
||||
def write_table_to_yaml(
    data: dict,
    run_date: date,
    filename: str | None = None
):
    """Dump ``data`` to a YAML file and print where it was written.

    Args:
        data: Mapping to serialise.
        run_date: Used only to build the default file name.
        filename: Destination path. When omitted, the file is named
            ``elt_pipeline_<run_date>.yml``.
    """
    # Fall back to a run-date based name when the caller gives no path.
    target = f"elt_pipeline_{run_date}.yml" if filename is None else filename

    with open(target, "w") as stream:
        # Same dump options as before: block style, keys left unsorted.
        yaml.dump(data, stream, default_flow_style=False, sort_keys=False)

    print(f"Table written to {target}")
|
||||
|
||||
|
||||
|
||||
|
||||
def table_exists(
|
||||
client,
|
||||
table_name: str,
|
||||
@@ -68,7 +107,7 @@ def table_exists(
|
||||
# Main
|
||||
# ==========================================================
|
||||
|
||||
def main():
|
||||
def elt(run_date : date):
|
||||
|
||||
log.info("=" * 80)
|
||||
log.info("Hello from data-move Python data pipeline!")
|
||||
@@ -77,13 +116,7 @@ def main():
|
||||
# Run Date
|
||||
# ------------------------------------------------------
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
run_date = datetime.strptime(
|
||||
sys.argv[1],
|
||||
"%Y-%m-%d",
|
||||
).date()
|
||||
else:
|
||||
run_date = date.today() - timedelta(days=1)
|
||||
|
||||
|
||||
log.info(
|
||||
"Pipeline Run Date: %s",
|
||||
@@ -126,7 +159,7 @@ def main():
|
||||
# ------------------------------------------------------
|
||||
|
||||
with open(
|
||||
"t.yml",
|
||||
"y.yml",
|
||||
"r",
|
||||
) as file:
|
||||
|
||||
@@ -211,8 +244,6 @@ def main():
|
||||
table_name,
|
||||
)
|
||||
|
||||
elif operation =="ONLY_INSERT" :
|
||||
continue
|
||||
else:
|
||||
|
||||
delete_existing_data(
|
||||
@@ -253,7 +284,7 @@ def main():
|
||||
log.info("=" * 80)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def main() :
|
||||
|
||||
config_file = Path("Pipeline_config.yml")
|
||||
|
||||
@@ -274,40 +305,96 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
|
||||
for attempt in range(3):
|
||||
try:
|
||||
main()
|
||||
p_start_date, p_end_date , flag= get_dates_from_yaml("elt_pipeline_custom_dates.yml")
|
||||
if flag =="Y" :
|
||||
start_date=p_start_date
|
||||
end_date=p_end_date
|
||||
|
||||
with open("Pipeline_config.yml", "r") as f:
|
||||
config = yaml.safe_load(f)
|
||||
elif len(sys.argv) > 1:
|
||||
start_date = datetime.strptime(
|
||||
sys.argv[1],
|
||||
"%Y-%m-%d",
|
||||
).date()
|
||||
end_date=start_date + timedelta(days=1)
|
||||
else:
|
||||
start_date = date.today() - timedelta(days=1)
|
||||
end_date=start_date
|
||||
|
||||
config["pipeline"]["last_successful_run_date"] = str(date.today())
|
||||
log.info(
|
||||
"Pipeline Start Date: %s",
|
||||
start_date,
|
||||
)
|
||||
|
||||
with open("Pipeline_config.yml", "w") as f:
|
||||
yaml.safe_dump(config, f, sort_keys=False)
|
||||
|
||||
log.info(
|
||||
f"Pipeline completed successfully. "
|
||||
f"last_successful_run_date={date.today()}"
|
||||
failed_dates=[]
|
||||
successful_dates=[]
|
||||
filename_successful = "successful_Pipeline_dates_config.yml"
|
||||
filename_failed = "failed_Pipeline_dates_config.yml"
|
||||
|
||||
while start_date <=end_date:
|
||||
run_date = start_date
|
||||
|
||||
for attempt in range(3):
|
||||
try:
|
||||
elt(run_date)
|
||||
|
||||
successful_dates.append({
|
||||
'pipeline_trigeered_on_date': str(date.today()),
|
||||
'last_successful_run_date': run_date,
|
||||
})
|
||||
|
||||
log.info(
|
||||
f"Pipeline completed successfully. "
|
||||
f"pipeline_trigeered_on_date={date.today()}"
|
||||
f"last_successful_run_date={run_date}"
|
||||
)
|
||||
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
|
||||
|
||||
|
||||
failed_dates.append({
|
||||
'pipeline_trigeered_on_date': str(date.today()),
|
||||
'failed_run_date': run_date,
|
||||
"attempt" : attempt
|
||||
})
|
||||
|
||||
|
||||
if attempt == 2:
|
||||
raise
|
||||
|
||||
log.warning(
|
||||
f"Pipeline failed. Retry {attempt + 1}/3. Error: {e}"
|
||||
)
|
||||
|
||||
sleep(5)
|
||||
|
||||
|
||||
start_date=start_date + timedelta(days=1)
|
||||
|
||||
|
||||
|
||||
with open(filename_successful, "w") as f:
|
||||
yaml.dump(
|
||||
successful_dates,
|
||||
f,
|
||||
default_flow_style=False,
|
||||
sort_keys=False,
|
||||
)
|
||||
if len(failed_dates) == 0 :
|
||||
failed_dates.append({
|
||||
'pipeline_trigeered_on_date': str(date.today()),
|
||||
'failed_run_date': "none",
|
||||
"attempt" : "none"
|
||||
})
|
||||
with open(filename_failed, "w") as f:
|
||||
yaml.dump(failed_dates,
|
||||
f, default_flow_style=False,
|
||||
sort_keys=False)
|
||||
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
if __name__ == "__main__":
|
||||
|
||||
with open("Pipeline_config.yml", "r") as f:
|
||||
config = yaml.safe_load(f)
|
||||
|
||||
config["pipeline"]["run_date"] = str(date.today())
|
||||
|
||||
with open("Pipeline_config.yml", "w") as f:
|
||||
yaml.safe_dump(config, f, sort_keys=False)
|
||||
|
||||
if attempt == 2:
|
||||
raise
|
||||
|
||||
log.warning(
|
||||
f"Pipeline failed. Retry {attempt + 1}/3. Error: {e}"
|
||||
)
|
||||
|
||||
sleep(5)
|
||||
main()
|
||||
+105
-77
@@ -14,97 +14,125 @@ from db_con.connection import (
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def fetch_mapping_storevisibility(
|
||||
sql_engine: Engine,
|
||||
table_name: str,
|
||||
table_type: str,
|
||||
mids: list[int],
|
||||
run_date: date
|
||||
table_type: str,
|
||||
mids: list[int],
|
||||
run_date: date
|
||||
) -> pl.DataFrame:
|
||||
|
||||
run_date = run_date + timedelta(days=1)
|
||||
client= get_clickhouse_client()
|
||||
def table_exists(
|
||||
client,
|
||||
table_name: str,
|
||||
) -> bool:
|
||||
|
||||
return bool(
|
||||
client.command(
|
||||
f"EXISTS TABLE {table_name}"
|
||||
)
|
||||
)
|
||||
def get_reason_ids_mapping_storevisibility(
|
||||
client,
|
||||
run_date: date,
|
||||
table_name: str = "mapping_storevisibility",
|
||||
) -> list[int] :
|
||||
|
||||
if not table_exists(client, table_name):
|
||||
log.warning(f"Table '{table_name}' does not exist. During collecting store_ids")
|
||||
return [0]
|
||||
|
||||
|
||||
query = f"""
|
||||
SELECT DISTINCT StoreId
|
||||
FROM mapping_storevisibility
|
||||
WHERE toDate(Fromdate) <= toDate('{run_date + timedelta(days= 1)}')
|
||||
AND toDate(Todate) >= toDate('{run_date + timedelta(days= 1)}')
|
||||
AND project_Id = '40148'
|
||||
log.info(f"Fetching data from sql server for {table_type} table......")
|
||||
|
||||
"""
|
||||
sql_file = Path("src") / "sql" / f"bridge" / f"{table_name}.sql"
|
||||
|
||||
# ClickHouse -> PyArrow -> Polars
|
||||
arrow_table = client.query_arrow(query)
|
||||
with open(sql_file, "r", encoding="utf-8") as f:
|
||||
sql_template = f.read()
|
||||
|
||||
df= pl.from_arrow(arrow_table)
|
||||
list=df["reason_id"].to_list()
|
||||
return list
|
||||
sql = sql_template.format( )
|
||||
|
||||
def fetch_data(
|
||||
engine: Engine,
|
||||
table_name: str,
|
||||
table_type: str,
|
||||
run_date: date,
|
||||
store_id: list[int]
|
||||
) -> pl.DataFrame:
|
||||
log.info(f"Fetching data from sql server for Master table......")
|
||||
log.info(f"Fetching in progress .... ")
|
||||
|
||||
store_id_list = ",".join(str(sid) for sid in store_id)
|
||||
df = pl.read_database(
|
||||
query=sql,
|
||||
connection=sql_engine
|
||||
)
|
||||
|
||||
sql_file = Path("src") / "sql" / f"bridge" / f"{table_name}.sql"
|
||||
|
||||
with open(sql_file, "r", encoding="utf-8") as f:
|
||||
sql_template = f.read()
|
||||
|
||||
sql = sql_template.format(
|
||||
store_id_list=store_id_list,
|
||||
run_date=run_date.strftime("%Y-%m-%d")
|
||||
|
||||
)
|
||||
|
||||
log.info(f"Fetching in progress .... ")
|
||||
|
||||
df = pl.read_database(
|
||||
query=sql,
|
||||
connection=engine
|
||||
)
|
||||
|
||||
log.info(f"Fetched {len(df):,} rows from SQL Server")
|
||||
|
||||
return df
|
||||
|
||||
store_id=get_reason_ids_mapping_storevisibility(client, run_date, "mapping_storevisibility")
|
||||
df=fetch_data(engine=sql_engine,
|
||||
table_name=table_name,
|
||||
table_type=table_type,
|
||||
run_date=run_date,
|
||||
store_id=store_id,
|
||||
)
|
||||
log.info(f"Fetched {len(df):,} rows from SQL Server")
|
||||
|
||||
return df
|
||||
|
||||
|
||||
# def fetch_mapping_storevisibility(
|
||||
# sql_engine: Engine,
|
||||
# table_name: str,
|
||||
# table_type: str,
|
||||
# mids: list[int],
|
||||
# run_date: date
|
||||
# ) -> pl.DataFrame:
|
||||
|
||||
# run_date = run_date + timedelta(days=1)
|
||||
# client= get_clickhouse_client()
|
||||
# def table_exists(
|
||||
# client,
|
||||
# table_name: str,
|
||||
# ) -> bool:
|
||||
|
||||
# return bool(
|
||||
# client.command(
|
||||
# f"EXISTS TABLE {table_name}"
|
||||
# )
|
||||
# )
|
||||
# def get_reason_ids_mapping_storevisibility(
|
||||
# client,
|
||||
# run_date: date,
|
||||
# table_name: str = "mapping_storevisibility",
|
||||
# ) -> list[int] :
|
||||
|
||||
# if not table_exists(client, table_name):
|
||||
# log.warning(f"Table '{table_name}' does not exist. During collecting store_ids")
|
||||
# return [0]
|
||||
|
||||
|
||||
# query = f"""
|
||||
# SELECT DISTINCT StoreId
|
||||
# FROM mapping_storevisibility
|
||||
# WHERE toDate(Fromdate) <= toDate('{run_date + timedelta(days= 1)}')
|
||||
# AND toDate(Todate) >= toDate('{run_date + timedelta(days= 1)}')
|
||||
# AND project_Id = '40148'
|
||||
|
||||
# """
|
||||
|
||||
# # ClickHouse -> PyArrow -> Polars
|
||||
# arrow_table = client.query_arrow(query)
|
||||
|
||||
# df= pl.from_arrow(arrow_table)
|
||||
# list=df["reason_id"].to_list()
|
||||
# return list
|
||||
|
||||
# def fetch_data(
|
||||
# engine: Engine,
|
||||
# table_name: str,
|
||||
# table_type: str,
|
||||
# run_date: date,
|
||||
# store_id: list[int]
|
||||
# ) -> pl.DataFrame:
|
||||
# log.info(f"Fetching data from sql server for Master table......")
|
||||
|
||||
# store_id_list = ",".join(str(sid) for sid in store_id)
|
||||
|
||||
# sql_file = Path("src") / "sql" / f"bridge" / f"{table_name}.sql"
|
||||
|
||||
# with open(sql_file, "r", encoding="utf-8") as f:
|
||||
# sql_template = f.read()
|
||||
|
||||
# sql = sql_template.format(
|
||||
# store_id_list=store_id_list,
|
||||
# run_date=run_date.strftime("%Y-%m-%d")
|
||||
|
||||
# )
|
||||
|
||||
# log.info(f"Fetching in progress .... ")
|
||||
|
||||
# df = pl.read_database(
|
||||
# query=sql,
|
||||
# connection=engine
|
||||
# )
|
||||
|
||||
# log.info(f"Fetched {len(df):,} rows from SQL Server")
|
||||
|
||||
# return df
|
||||
|
||||
# store_id=get_reason_ids_mapping_storevisibility(client, run_date, "mapping_storevisibility")
|
||||
# df=fetch_data(engine=sql_engine,
|
||||
# table_name=table_name,
|
||||
# table_type=table_type,
|
||||
# run_date=run_date,
|
||||
# store_id=store_id,
|
||||
# )
|
||||
# log.info(f"Fetched {len(df):,} rows from SQL Server")
|
||||
|
||||
# return df
|
||||
|
||||
|
||||
+291
-121
@@ -134,6 +134,7 @@ def fetch_additional_visibility( engine: Engine,
|
||||
return df
|
||||
|
||||
|
||||
|
||||
def fetch_OQaD(
|
||||
sql_engine: Engine,
|
||||
table_name: str,
|
||||
@@ -142,166 +143,335 @@ def fetch_OQaD(
|
||||
run_date: date
|
||||
) -> pl.DataFrame:
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# INNER HELPERS (defined once, used below)
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
client= get_clickhouse_client()
|
||||
def table_exists(
|
||||
client,
|
||||
table_name: str,
|
||||
) -> bool:
|
||||
client = get_clickhouse_client()
|
||||
|
||||
return bool(
|
||||
client.command(
|
||||
f"EXISTS TABLE {table_name}"
|
||||
)
|
||||
)
|
||||
# ── Does a ClickHouse table exist? ────────────
|
||||
def table_exists(client, table_name: str) -> bool:
|
||||
|
||||
return bool(client.command(f"EXISTS TABLE {table_name}"))
|
||||
|
||||
# ── STEP 1: Who submitted yesterday in SQL Server? ───
|
||||
def fetch_quiz_empids(engine: Engine, run_date: date) -> pl.DataFrame:
|
||||
|
||||
|
||||
# Format date ONCE safely — avoids f-string injection bugs
|
||||
run_date_str = run_date.strftime("%Y-%m-%d")
|
||||
next_date_str = (run_date + timedelta(days=1)).strftime("%Y-%m-%d")
|
||||
|
||||
def fetch_quiz_empids(engine: Engine, run_date : date) -> pl.DataFrame:
|
||||
|
||||
sql_template = f"""
|
||||
WITH MID_TABLE_COV1 AS
|
||||
sql = f"""
|
||||
WITH MID_TABLE_COV1 AS
|
||||
(
|
||||
SELECT EmpId, VisitDate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
WHERE CreateDate >= {run_date}
|
||||
AND CreateDate < DATEADD(DAY,1,'{run_date}')
|
||||
-- Records CREATED yesterday
|
||||
SELECT EmpId, CAST(VisitDate AS DATE) AS VisitDate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
WHERE CreateDate >= '{run_date_str}'
|
||||
AND CreateDate < '{next_date_str}'
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT EmpId, VisitDate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
WHERE UpdateDate >= {run_date}
|
||||
AND UpdateDate < DATEADD(DAY,1, '{run_date}')
|
||||
-- Records UPDATED yesterday (different rows, safe to UNION ALL)
|
||||
SELECT EmpId, CAST(VisitDate AS DATE) AS VisitDate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
WHERE UpdateDate >= '{run_date_str}'
|
||||
AND UpdateDate < '{next_date_str}'
|
||||
),
|
||||
|
||||
QUIZ AS
|
||||
(
|
||||
SELECT Distinct E.EmpId as empid
|
||||
, CONVERT(date,DQ.VisitDate) AS visitdate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD DQ INNER JOIN
|
||||
OneApp_KelloggsMT.dbo.vw_Employee_Detail E ON DQ.EmpId = E.EmpId inner join
|
||||
OneApp_KelloggsMT.dbo.Master_OQAD_Question QU on DQ.QuestionId= qu.QuestionId inner join
|
||||
OneApp_KelloggsMT.dbo.Master_OQAD_Category qc on qu.QuestionCategoryId= qc.QuestionCategoryId
|
||||
where e.EmpName not like 'test%' and e.RightId in (6)
|
||||
and (E.ResignDate is null or E.ResignDate>=''+CONVERT(VARCHAR,'{run_date}')+'') AND E.EmpName NOT LIKE '%TEST%'
|
||||
AND DQ.EmpId IN (SELECT EmpId FROM MID_TABLE_COV1 A WHERE
|
||||
DQ.EmpId=A.EmpId AND CONVERT(date,VisitDate)=CONVERT(date,A.VisitDate) )
|
||||
) select * from quiz
|
||||
"""
|
||||
sql = sql_template.format(
|
||||
run_date=run_date.strftime("%Y-%m-%d")
|
||||
)
|
||||
|
||||
log.info(f"Fetching quiz_empids data for EMPID and Visitid")
|
||||
|
||||
df = pl.read_database(
|
||||
query=sql,
|
||||
connection=engine
|
||||
)
|
||||
|
||||
|
||||
log.info(f"Fetched {len(df):,} total empid and visitdate fetched for OQAD from SQL Server")
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def get_empids_clickhouse_OQAD(
|
||||
client,
|
||||
table_name: str = "OQaD",
|
||||
) -> pl.DataFrame:
|
||||
|
||||
if not table_exists(client, table_name):
|
||||
log.warning(f"Table '{table_name}' does not exist.")
|
||||
return pl.DataFrame(
|
||||
schema={
|
||||
"empid": pl.Int64,
|
||||
"visitdate": pl.Date,
|
||||
}
|
||||
)
|
||||
|
||||
query = f"""
|
||||
SELECT DISTINCT
|
||||
employee_id AS empid,
|
||||
visit_date AS visitdate
|
||||
FROM {table_name}
|
||||
"""
|
||||
E.EmpId AS empid,
|
||||
CAST(DQ.VisitDate AS DATE) AS visitdate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD DQ
|
||||
INNER JOIN OneApp_KelloggsMT.dbo.vw_Employee_Detail E
|
||||
ON DQ.EmpId = E.EmpId
|
||||
INNER JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Question QU
|
||||
ON DQ.QuestionId = QU.QuestionId
|
||||
INNER JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Category QC
|
||||
ON QU.QuestionCategoryId = QC.QuestionCategoryId
|
||||
WHERE E.EmpName NOT LIKE '%TEST%' -- exclude test employees
|
||||
AND E.RightId = 6 -- only field reps
|
||||
AND (
|
||||
E.ResignDate IS NULL
|
||||
OR CAST(E.ResignDate AS DATE) >= '{run_date_str}'
|
||||
)
|
||||
AND EXISTS ( -- ✅ EXISTS beats IN for large sets
|
||||
SELECT 1
|
||||
FROM MID_TABLE_COV1 A
|
||||
WHERE A.EmpId = DQ.EmpId
|
||||
AND A.VisitDate = CAST(DQ.VisitDate AS DATE)
|
||||
)
|
||||
)
|
||||
SELECT * FROM QUIZ
|
||||
"""
|
||||
|
||||
# ClickHouse -> PyArrow -> Polars
|
||||
arrow_table = client.query_arrow(query)
|
||||
log.info("Fetching quiz empids for run_date=%s", run_date_str)
|
||||
df = pl.read_database(query=sql, connection=engine)
|
||||
log.info("Fetched %s (EmpId, VisitDate) pairs from SQL Server", len(df))
|
||||
return df
|
||||
|
||||
return pl.from_arrow(arrow_table)
|
||||
# ── STEP 2: Who do we ALREADY have in ClickHouse? ───
|
||||
def get_empids_clickhouse_OQAD(
|
||||
client,
|
||||
table_name: str = "OQaD",
|
||||
) -> pl.DataFrame:
|
||||
|
||||
|
||||
if not table_exists(client, table_name):
|
||||
log.warning("Table '%s' does not exist in ClickHouse.", table_name)
|
||||
return pl.DataFrame(schema={"empid": pl.Int64, "visitdate": pl.Date})
|
||||
|
||||
qf=fetch_quiz_empids(sql_engine,run_date)
|
||||
db_df = get_empids_clickhouse_OQAD(client)
|
||||
query = f"""
|
||||
SELECT DISTINCT
|
||||
employee_id AS empid,
|
||||
visit_date AS visitdate
|
||||
FROM {table_name}
|
||||
"""
|
||||
|
||||
matched = qf.join(
|
||||
db_df,
|
||||
arrow_table = client.query_arrow(query)
|
||||
df = pl.from_arrow(arrow_table)
|
||||
log.info("Fetched %s existing (EmpId, VisitDate) pairs from ClickHouse", len(df))
|
||||
return df
|
||||
|
||||
# ── STEP 3: Who is NEW? (in SQL Server but NOT yet in ClickHouse) ───
|
||||
def find_new_empids(
|
||||
sql_df: pl.DataFrame,
|
||||
ch_df: pl.DataFrame,
|
||||
) -> list[int]:
|
||||
|
||||
|
||||
new_df = sql_df.join(
|
||||
ch_df,
|
||||
on=["empid", "visitdate"],
|
||||
how="inner",
|
||||
how="anti", # ✅ anti = keep rows NOT found in ch_df
|
||||
)
|
||||
|
||||
if matched.is_empty():
|
||||
if new_df.is_empty():
|
||||
log.warning("No new EmpIds found for table=%s — nothing to fetch.", table_name)
|
||||
return [0] # sentinel value — the .sql WHERE will return 0 rows safely
|
||||
|
||||
empids=[0]
|
||||
log.warning(
|
||||
"%s Matched df in OQaD returned no rows",
|
||||
table_name,
|
||||
)
|
||||
empids = new_df["empid"].unique().to_list()
|
||||
log.info("Found %s NEW empids to fetch for %s", len(empids), table_name)
|
||||
return empids
|
||||
|
||||
else:
|
||||
empids=matched["empid"].to_list()
|
||||
# ── STEP 4: Fetch full quiz data for new empids ───
|
||||
def fetch_data(
|
||||
engine: Engine,
|
||||
table_name: str,
|
||||
table_type: str,
|
||||
empids: list[int],
|
||||
run_date: date,
|
||||
) -> pl.DataFrame:
|
||||
|
||||
|
||||
log.info(f"Fetched {len(empids):,} matched empids fetched for OQAD ")
|
||||
run_date_str = run_date.strftime("%Y-%m-%d")
|
||||
empid_list = ", ".join(str(e) for e in empids) # "101, 102, 103"
|
||||
|
||||
def fetch_data(
|
||||
engine: Engine,
|
||||
table_name: str,
|
||||
table_type: str,
|
||||
empids: list[int],
|
||||
run_date: date
|
||||
) -> pl.DataFrame:
|
||||
sql_file = Path("src") / "sql" / "fact" / f"{table_name}.sql"
|
||||
log.info("Loading SQL from: %s (exists=%s)", sql_file.resolve(), sql_file.exists())
|
||||
|
||||
empid_list = ",".join(str(empid) for empid in empids)
|
||||
with open(sql_file, "r", encoding="utf-8") as f:
|
||||
sql_template = f.read()
|
||||
|
||||
sql = sql_template.format(
|
||||
empid_list=empid_list,
|
||||
run_date=run_date_str,
|
||||
)
|
||||
|
||||
log.info("Fetching full OQaD data for %s empids, run_date=%s", len(empids), run_date_str)
|
||||
df = pl.read_database(query=sql, connection=engine)
|
||||
log.info("Fetched %s rows from SQL Server for table=%s", len(df), table_name)
|
||||
return df
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# MAIN FLOW (the 4 steps, clearly sequenced)
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
qf = fetch_quiz_empids(sql_engine, run_date) # Step 1
|
||||
db_df = get_empids_clickhouse_OQAD(client, table_name) # Step 2
|
||||
empids = find_new_empids(qf, db_df) # Step 3
|
||||
|
||||
df = fetch_data( # Step 4
|
||||
engine=sql_engine,
|
||||
table_name=table_name,
|
||||
table_type=table_type,
|
||||
empids=empids,
|
||||
run_date=run_date,
|
||||
)
|
||||
|
||||
log.info("fetch_OQaD complete — returning %s rows", len(df))
|
||||
return df
|
||||
|
||||
# def fetch_OQaD(
|
||||
# sql_engine: Engine,
|
||||
# table_name: str,
|
||||
# table_type: str,
|
||||
# mids: list[int],
|
||||
# run_date: date
|
||||
# ) -> pl.DataFrame:
|
||||
|
||||
|
||||
sql_file = Path("src") / "sql" / "fact" / f"{table_name}.sql"
|
||||
# client= get_clickhouse_client()
|
||||
# def table_exists(
|
||||
# client,
|
||||
# table_name: str,
|
||||
# ) -> bool:
|
||||
|
||||
log.info(f"Exists: {sql_file.exists()}")
|
||||
log.info(f"Path: {sql_file.resolve()}")
|
||||
# return bool(
|
||||
# client.command(
|
||||
# f"EXISTS TABLE {table_name}"
|
||||
# )
|
||||
# )
|
||||
|
||||
with open(sql_file, "r", encoding="utf-8") as f:
|
||||
sql_template = f.read()
|
||||
|
||||
sql = sql_template.format(
|
||||
empid_list=empid_list,
|
||||
run_date=run_date.strftime("%Y-%m-%d")
|
||||
)
|
||||
|
||||
log.info(f"Fetching data for {len(empids):,} EMPIDs")
|
||||
# def fetch_quiz_empids(engine: Engine, run_date : date) -> pl.DataFrame:
|
||||
|
||||
log.info("Fetching OQaD data for run_date=%s", run_date)
|
||||
# sql_template = f"""
|
||||
# WITH MID_TABLE_COV1 AS
|
||||
# (
|
||||
# SELECT EmpId, VisitDate
|
||||
# FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
# WHERE CreateDate >= {run_date}
|
||||
# AND CreateDate < DATEADD(DAY,1,'{run_date}')
|
||||
|
||||
df = pl.read_database(
|
||||
query=sql,
|
||||
connection=engine,
|
||||
)
|
||||
# UNION
|
||||
|
||||
log.info("fn name is fetch_OQad ------Fetched %s rows", len(df))
|
||||
# SELECT EmpId, VisitDate
|
||||
# FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
# WHERE UpdateDate >= {run_date}
|
||||
# AND UpdateDate < DATEADD(DAY,1, '{run_date}')
|
||||
# ),
|
||||
|
||||
return df
|
||||
df=fetch_data( engine=sql_engine,
|
||||
table_name=table_name,
|
||||
table_type=table_type,
|
||||
empids=empids,
|
||||
run_date=run_date
|
||||
)
|
||||
log.info(f"Fetched {len(df):,} rows from SQL Server")
|
||||
# QUIZ AS
|
||||
# (
|
||||
# SELECT Distinct E.EmpId as empid
|
||||
# , CONVERT(date,DQ.VisitDate) AS visitdate
|
||||
# FROM OneApp_KelloggsMT.dbo.T_OQAD DQ INNER JOIN
|
||||
# OneApp_KelloggsMT.dbo.vw_Employee_Detail E ON DQ.EmpId = E.EmpId inner join
|
||||
# OneApp_KelloggsMT.dbo.Master_OQAD_Question QU on DQ.QuestionId= qu.QuestionId inner join
|
||||
# OneApp_KelloggsMT.dbo.Master_OQAD_Category qc on qu.QuestionCategoryId= qc.QuestionCategoryId
|
||||
# where e.EmpName not like 'test%' and e.RightId in (6)
|
||||
# and (E.ResignDate is null or E.ResignDate>=''+CONVERT(VARCHAR,'{run_date}')+'') AND E.EmpName NOT LIKE '%TEST%'
|
||||
# AND DQ.EmpId IN (SELECT EmpId FROM MID_TABLE_COV1 A WHERE
|
||||
# DQ.EmpId=A.EmpId AND CONVERT(date,VisitDate)=CONVERT(date,A.VisitDate) )
|
||||
# ) select * from quiz
|
||||
# """
|
||||
# sql = sql_template.format(
|
||||
# run_date=run_date.strftime("%Y-%m-%d")
|
||||
# )
|
||||
|
||||
return df
|
||||
# log.info(f"Fetching quiz_empids data for EMPID and Visitid")
|
||||
|
||||
# df = pl.read_database(
|
||||
# query=sql,
|
||||
# connection=engine
|
||||
# )
|
||||
|
||||
|
||||
# log.info(f"Fetched {len(df):,} total empid and visitdate fetched for OQAD from SQL Server")
|
||||
|
||||
# return df
|
||||
|
||||
|
||||
# def get_empids_clickhouse_OQAD(
|
||||
# client,
|
||||
# table_name: str = "OQaD",
|
||||
# ) -> pl.DataFrame:
|
||||
|
||||
# if not table_exists(client, table_name):
|
||||
# log.warning(f"Table '{table_name}' does not exist.")
|
||||
# return pl.DataFrame(
|
||||
# schema={
|
||||
# "empid": pl.Int64,
|
||||
# "visitdate": pl.Date,
|
||||
# }
|
||||
# )
|
||||
|
||||
# query = f"""
|
||||
# SELECT DISTINCT
|
||||
# employee_id AS empid,
|
||||
# visit_date AS visitdate
|
||||
# FROM {table_name}
|
||||
# """
|
||||
|
||||
# # ClickHouse -> PyArrow -> Polars
|
||||
# arrow_table = client.query_arrow(query)
|
||||
|
||||
# return pl.from_arrow(arrow_table)
|
||||
|
||||
|
||||
|
||||
# qf=fetch_quiz_empids(sql_engine,run_date)
|
||||
# db_df = get_empids_clickhouse_OQAD(client)
|
||||
|
||||
# matched = qf.join(
|
||||
# db_df,
|
||||
# on=["empid", "visitdate"],
|
||||
# how="inner",
|
||||
# )
|
||||
|
||||
# if matched.is_empty():
|
||||
|
||||
# empids=[0]
|
||||
# log.warning(
|
||||
# "%s Matched df in OQaD returned no rows",
|
||||
# table_name,
|
||||
# )
|
||||
|
||||
# else:
|
||||
# empids=matched["empid"].to_list()
|
||||
|
||||
|
||||
# log.info(f"Fetched {len(empids):,} matched empids fetched for OQAD ")
|
||||
|
||||
# def fetch_data(
|
||||
# engine: Engine,
|
||||
# table_name: str,
|
||||
# table_type: str,
|
||||
# empids: list[int],
|
||||
# run_date: date
|
||||
# ) -> pl.DataFrame:
|
||||
|
||||
# empid_list = ",".join(str(empid) for empid in empids)
|
||||
|
||||
|
||||
# sql_file = Path("src") / "sql" / "fact" / f"{table_name}.sql"
|
||||
|
||||
# log.info(f"Exists: {sql_file.exists()}")
|
||||
# log.info(f"Path: {sql_file.resolve()}")
|
||||
|
||||
# with open(sql_file, "r", encoding="utf-8") as f:
|
||||
# sql_template = f.read()
|
||||
|
||||
# sql = sql_template.format(
|
||||
# empid_list=empid_list,
|
||||
# run_date=run_date.strftime("%Y-%m-%d")
|
||||
# )
|
||||
|
||||
# log.info(f"Fetching data for {len(empids):,} EMPIDs")
|
||||
|
||||
# log.info("Fetching OQaD data for run_date=%s", run_date)
|
||||
|
||||
# df = pl.read_database(
|
||||
# query=sql,
|
||||
# connection=engine,
|
||||
# )
|
||||
|
||||
# log.info("fn name is fetch_OQad ------Fetched %s rows", len(df))
|
||||
|
||||
# return df
|
||||
# df=fetch_data( engine=sql_engine,
|
||||
# table_name=table_name,
|
||||
# table_type=table_type,
|
||||
# empids=empids,
|
||||
# run_date=run_date
|
||||
# )
|
||||
# log.info(f"Fetched {len(df):,} rows from SQL Server")
|
||||
|
||||
# return df
|
||||
|
||||
|
||||
# def fetch_OQaD(
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
|
||||
|
||||
|
||||
with mapping_storevisibility
|
||||
(Project_Id,StoreId,VisibilityDefinitionid,Fromdate,Todate,CreateDate,CreateBy)
|
||||
AS (
|
||||
select DISTINCT '40148' as Project_Id,StoreId,VisibilityDefinitionid,Fromdate,Todate,getdate(),'SP-Pius'
|
||||
FROM OneApp_KelloggsMT.dbo.mapping_storevisibility z WHERE
|
||||
convert(date,FROMDATE,101)<=convert(Date,getdate(),101) AND CONVERT(DATE,ToDate,101)>=convert(Date,getdate(),101)
|
||||
AND z.VisibilityDefinitionid IN
|
||||
(SELECT DISTINCT VisibilityDefinitionid FROM OneApp_KelloggsMT.dbo.MASTER_VISIBILITYDEFINITION WHERE MENUID=22 )
|
||||
AND z.StoreId NOT IN ({store_id_list})
|
||||
FROM OneApp_KelloggsMT.dbo.mapping_storevisibility
|
||||
)
|
||||
select * from mapping_storevisibility
|
||||
+48
-53
@@ -1,19 +1,19 @@
|
||||
WITH MID_TABLE_COV1 AS
|
||||
(
|
||||
SELECT EmpId, VisitDate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
WHERE CreateDate >= {run_date}
|
||||
AND CreateDate < DATEADD(DAY,1,'{run_date}')
|
||||
|
||||
SELECT EmpId, CAST(VisitDate AS DATE) AS VisitDate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
WHERE CreateDate >= '{run_date}'
|
||||
AND CreateDate < DATEADD(DAY, 1, '{run_date}')
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT EmpId, VisitDate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
WHERE UpdateDate >= {run_date}
|
||||
AND UpdateDate < DATEADD(DAY,1, '{run_date}')
|
||||
SELECT EmpId, CAST(VisitDate AS DATE) AS VisitDate
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD
|
||||
WHERE UpdateDate >= '{run_date}'
|
||||
AND UpdateDate < DATEADD(DAY, 1, '{run_date}')
|
||||
),
|
||||
|
||||
|
||||
QUIZ AS
|
||||
(
|
||||
SELECT DISTINCT
|
||||
@@ -25,57 +25,52 @@ QUIZ AS
|
||||
E.CityName,
|
||||
E.StateName,
|
||||
E.RegionName,
|
||||
CAST(DQ.VisitDate AS DATE) AS VisitDate,
|
||||
CAST(DQ.VisitDate AS DATE) AS VisitDate,
|
||||
DQ.QuestionId,
|
||||
DQ.AnswerId,
|
||||
QC.QuestionCategoryId,
|
||||
QC.QuestionCategory
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD DQ
|
||||
INNER JOIN OneApp_KelloggsMT.dbo.vw_Employee_Detail E
|
||||
ON DQ.EmpId = E.EmpId
|
||||
FROM OneApp_KelloggsMT.dbo.T_OQAD DQ
|
||||
INNER JOIN OneApp_KelloggsMT.dbo.vw_Employee_Detail E
|
||||
ON DQ.EmpId = E.EmpId
|
||||
INNER JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Question QU
|
||||
ON DQ.QuestionId = QU.QuestionId
|
||||
ON DQ.QuestionId = QU.QuestionId
|
||||
INNER JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Category QC
|
||||
ON QU.QuestionCategoryId = QC.QuestionCategoryId
|
||||
WHERE E.EmpName NOT LIKE '%TEST%'
|
||||
AND E.RightId = 6
|
||||
AND (
|
||||
E.ResignDate IS NULL
|
||||
OR CAST(E.ResignDate AS DATE) >= '{run_date}'
|
||||
)
|
||||
AND EXISTS
|
||||
(
|
||||
SELECT 1
|
||||
FROM MID_TABLE_COV1 A
|
||||
WHERE A.EmpId = DQ.EmpId
|
||||
AND CAST(A.VisitDate AS DATE) = CAST(DQ.VisitDate AS DATE)
|
||||
)
|
||||
ON QU.QuestionCategoryId = QC.QuestionCategoryId
|
||||
WHERE E.EmpName NOT LIKE '%TEST%'
|
||||
AND E.RightId = 6
|
||||
AND (E.ResignDate IS NULL OR CAST(E.ResignDate AS DATE) >= '{run_date}')
|
||||
AND EXISTS (
|
||||
SELECT 1
|
||||
FROM MID_TABLE_COV1 A
|
||||
WHERE A.EmpId = DQ.EmpId
|
||||
AND A.VisitDate = CAST(DQ.VisitDate AS DATE)
|
||||
)
|
||||
-- ✅ Exclude EmpIds already loaded into ClickHouse
|
||||
AND E.EmpId NOT IN ({empid_list})
|
||||
)
|
||||
|
||||
SELECT
|
||||
40148 AS project_id,
|
||||
Q.EmpId AS employee_id,
|
||||
0 AS process_id,
|
||||
Q.VisitDate AS visit_date,
|
||||
Q.QuestionCategoryId AS question_category_id,
|
||||
Q.QuestionCategory AS question_category,
|
||||
QM.QuestionId AS question_id,
|
||||
QM.Question AS question,
|
||||
ISNULL(QA.AnswerId,0) AS answer_id,
|
||||
ISNULL(QA.Answer,'') AS answer,
|
||||
CASE
|
||||
WHEN QA.AnswerId IS NULL THEN 'Not Answer'
|
||||
WHEN QA.RightAnswer = 1 THEN 'Y'
|
||||
WHEN QA.RightAnswer IS NULL THEN 'Not Answer'
|
||||
ELSE 'N'
|
||||
END AS correct_answer,
|
||||
GETDATE() AS update_date,
|
||||
'SP-Pius' AS update_by
|
||||
FROM QUIZ Q
|
||||
40148 AS project_id,
|
||||
Q.EmpId AS employee_id,
|
||||
0 AS process_id,
|
||||
Q.VisitDate AS visit_date,
|
||||
Q.QuestionCategoryId AS question_category_id,
|
||||
Q.QuestionCategory AS question_category,
|
||||
QM.QuestionId AS question_id,
|
||||
QM.Question AS question,
|
||||
ISNULL(QA.AnswerId, 0) AS answer_id,
|
||||
ISNULL(QA.Answer, '') AS answer,
|
||||
CASE
|
||||
WHEN QA.AnswerId IS NULL THEN 'Not Answer'
|
||||
WHEN QA.RightAnswer = 1 THEN 'Y'
|
||||
WHEN QA.RightAnswer IS NULL THEN 'Not Answer'
|
||||
ELSE 'N'
|
||||
END AS correct_answer,
|
||||
GETDATE() AS update_date,
|
||||
'SP-Pius' AS update_by
|
||||
FROM QUIZ Q
|
||||
INNER JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Question QM
|
||||
ON Q.QuestionId = QM.QuestionId
|
||||
LEFT JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Answer QA
|
||||
ON Q.AnswerId = QA.AnswerId
|
||||
where Q.EmpId not in ({empid_list})
|
||||
|
||||
|
||||
ON Q.QuestionId = QM.QuestionId
|
||||
LEFT JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Answer QA
|
||||
ON Q.AnswerId = QA.AnswerId
|
||||
@@ -0,0 +1,2 @@
|
||||
- pipeline_trigeered_on_date: '2026-06-23'
|
||||
last_successful_run_date: 2026-06-22
|
||||
@@ -4,11 +4,6 @@ tables:
|
||||
operation: INSERT
|
||||
fetch_by: mids
|
||||
|
||||
- name: OQaD
|
||||
type: FACT
|
||||
operation: INSERT
|
||||
fetch_by: run_date
|
||||
|
||||
- name: Survey
|
||||
type: FACT
|
||||
operation: INSERT
|
||||
@@ -87,7 +82,7 @@ tables:
|
||||
|
||||
- name: mapping_storevisibility
|
||||
type: BRIDGE
|
||||
operation: ONLY_INSERT
|
||||
operation: DELETE+INSERT
|
||||
fetch_by: run_date
|
||||
|
||||
- name: Master_VisibilityReason
|
||||
@@ -105,3 +100,9 @@ tables:
|
||||
type: DIMENSION
|
||||
operation: DELETE+INSERT
|
||||
fetch_by: none
|
||||
|
||||
|
||||
- name: OQaD
|
||||
type: FACT
|
||||
operation: INSERT
|
||||
fetch_by: run_date
|
||||
@@ -4,10 +4,10 @@ tables:
|
||||
# operation: INSERT
|
||||
# fetch_by: mids
|
||||
|
||||
# # - name: OQaD
|
||||
# # type: FACT
|
||||
# # operation: INSERT
|
||||
# # fetch_by: run_date
|
||||
- name: OQaD
|
||||
type: FACT
|
||||
operation: INSERT
|
||||
fetch_by: run_date
|
||||
|
||||
# - name: additional_visibility
|
||||
# type: FACT
|
||||
@@ -102,8 +102,8 @@ tables:
|
||||
# operation: DELETE+INSERT
|
||||
# fetch_by: none
|
||||
|
||||
- name: Promotion
|
||||
type: FACT
|
||||
operation: INSERT
|
||||
fetch_by: mids
|
||||
# - name: Promotion
|
||||
# type: FACT
|
||||
# operation: INSERT
|
||||
# fetch_by: mids
|
||||
|
||||
|
||||
Reference in New Issue
Block a user