final commit

This commit is contained in:
Ankit Malik
2026-06-23 18:23:58 +05:30
parent e218aafc26
commit 6b2d754981
15 changed files with 2803 additions and 323 deletions
+4
View File
@@ -25,3 +25,7 @@ RUN uv sync --frozen
COPY . . COPY . .
ENV PATH="/opt/airflow/project/.venv/bin:${PATH}" ENV PATH="/opt/airflow/project/.venv/bin:${PATH}"
# Airflow DAG folder
ENV AIRFLOW__CORE__DAGS_FOLDER=/opt/airflow/dags
-1
View File
@@ -2,4 +2,3 @@ pipeline:
error_message: null error_message: null
run_date: null run_date: null
status: null status: null
last_successful_run_date: '2026-06-22'
+1 -1
View File
@@ -9,4 +9,4 @@ CH_HOST=172.188.12.194
CH_PORT=8123 CH_PORT=8123
CH_USER=default CH_USER=default
CH_PASS=dipanshu_k CH_PASS=dipanshu_k
CH_DB=kelloggs_1 CH_DB=kelloggs_z
+5
View File
@@ -0,0 +1,5 @@
pipeline:
start_date: '2026-06-01'
end_date: '2026-06-20'
flag: N
Note: 'Set flag to Y to run the pipeline for the start_date to end_date range above, or to N to ignore these dates.'
+3
View File
@@ -0,0 +1,3 @@
- pipeline_trigeered_on_date: '2026-06-23'
failed_run_date: none
attempt: none
File diff suppressed because it is too large Load Diff
+113 -26
View File
@@ -52,6 +52,45 @@ from src.dim import *
# Helpers # Helpers
# ========================================================== # ==========================================================
def get_dates_from_yaml(filename: str) -> tuple[date, date, str]:
    """Read the custom date window and its enable flag from a YAML file.

    The file is expected to hold a ``pipeline`` mapping with the keys
    ``start_date``, ``end_date`` and ``flag``.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        A ``(start_date, end_date, flag)`` tuple. Both dates are
        ``datetime.date`` objects; ``flag`` is a stripped, upper-cased
        string, so the caller's ``flag == "Y"`` check is not defeated by
        ``y``, ``" Y "`` or an unquoted YAML boolean.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        KeyError: If ``pipeline`` or one of its required keys is missing.
        ValueError: If a date value is not an ISO ``YYYY-MM-DD`` string.
    """
    with open(filename, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)

    pipeline = data["pipeline"]

    # str() covers both quoted dates (loaded as str) and unquoted dates
    # (loaded by PyYAML as datetime.date).
    start_date = date.fromisoformat(str(pipeline["start_date"]))
    end_date = date.fromisoformat(str(pipeline["end_date"]))

    raw_flag = pipeline["flag"]
    if isinstance(raw_flag, bool):
        # PyYAML resolves unquoted yes/no/true/false/on/off to booleans;
        # str(True) would be "True" and never match "Y".
        flag = "Y" if raw_flag else "N"
    else:
        flag = str(raw_flag).strip().upper()

    return start_date, end_date, flag
def write_table_to_yaml(
    data: dict,
    run_date: date,
    filename: str | None = None
):
    """Dump ``data`` to a YAML file in block style, keeping key order.

    When ``filename`` is not given, the output path defaults to
    ``elt_pipeline_<run_date>.yml``. The path written is printed once the
    dump has finished.
    """
    target = f"elt_pipeline_{run_date}.yml" if filename is None else filename

    dump_options = {"default_flow_style": False, "sort_keys": False}
    with open(target, "w") as out:
        yaml.dump(data, out, **dump_options)

    print(f"Table written to {target}")
def table_exists( def table_exists(
client, client,
table_name: str, table_name: str,
@@ -68,7 +107,7 @@ def table_exists(
# Main # Main
# ========================================================== # ==========================================================
def main(): def elt(run_date : date):
log.info("=" * 80) log.info("=" * 80)
log.info("Hello from data-move Python data pipeline!") log.info("Hello from data-move Python data pipeline!")
@@ -77,13 +116,7 @@ def main():
# Run Date # Run Date
# ------------------------------------------------------ # ------------------------------------------------------
if len(sys.argv) > 1:
run_date = datetime.strptime(
sys.argv[1],
"%Y-%m-%d",
).date()
else:
run_date = date.today() - timedelta(days=1)
log.info( log.info(
"Pipeline Run Date: %s", "Pipeline Run Date: %s",
@@ -126,7 +159,7 @@ def main():
# ------------------------------------------------------ # ------------------------------------------------------
with open( with open(
"t.yml", "y.yml",
"r", "r",
) as file: ) as file:
@@ -211,8 +244,6 @@ def main():
table_name, table_name,
) )
elif operation =="ONLY_INSERT" :
continue
else: else:
delete_existing_data( delete_existing_data(
@@ -253,7 +284,7 @@ def main():
log.info("=" * 80) log.info("=" * 80)
if __name__ == "__main__": def main() :
config_file = Path("Pipeline_config.yml") config_file = Path("Pipeline_config.yml")
@@ -274,34 +305,62 @@ if __name__ == "__main__":
p_start_date, p_end_date , flag= get_dates_from_yaml("elt_pipeline_custom_dates.yml")
if flag =="Y" :
start_date=p_start_date
end_date=p_end_date
elif len(sys.argv) > 1:
start_date = datetime.strptime(
sys.argv[1],
"%Y-%m-%d",
).date()
end_date=start_date + timedelta(days=1)
else:
start_date = date.today() - timedelta(days=1)
end_date=start_date
log.info(
"Pipeline Start Date: %s",
start_date,
)
failed_dates=[]
successful_dates=[]
filename_successful = "successful_Pipeline_dates_config.yml"
filename_failed = "failed_Pipeline_dates_config.yml"
while start_date <=end_date:
run_date = start_date
for attempt in range(3): for attempt in range(3):
try: try:
main() elt(run_date)
with open("Pipeline_config.yml", "r") as f: successful_dates.append({
config = yaml.safe_load(f) 'pipeline_trigeered_on_date': str(date.today()),
'last_successful_run_date': run_date,
config["pipeline"]["last_successful_run_date"] = str(date.today()) })
with open("Pipeline_config.yml", "w") as f:
yaml.safe_dump(config, f, sort_keys=False)
log.info( log.info(
f"Pipeline completed successfully. " f"Pipeline completed successfully. "
f"last_successful_run_date={date.today()}" f"pipeline_trigeered_on_date={date.today()}"
f"last_successful_run_date={run_date}"
) )
break break
except Exception as e: except Exception as e:
with open("Pipeline_config.yml", "r") as f:
config = yaml.safe_load(f)
config["pipeline"]["run_date"] = str(date.today())
with open("Pipeline_config.yml", "w") as f: failed_dates.append({
yaml.safe_dump(config, f, sort_keys=False) 'pipeline_trigeered_on_date': str(date.today()),
'failed_run_date': run_date,
"attempt" : attempt
})
if attempt == 2: if attempt == 2:
raise raise
@@ -311,3 +370,31 @@ if __name__ == "__main__":
) )
sleep(5) sleep(5)
start_date=start_date + timedelta(days=1)
with open(filename_successful, "w") as f:
yaml.dump(
successful_dates,
f,
default_flow_style=False,
sort_keys=False,
)
if len(failed_dates) == 0 :
failed_dates.append({
'pipeline_trigeered_on_date': str(date.today()),
'failed_run_date': "none",
"attempt" : "none"
})
with open(filename_failed, "w") as f:
yaml.dump(failed_dates,
f, default_flow_style=False,
sort_keys=False)
if __name__ == "__main__":
main()
View File
+93 -65
View File
@@ -14,9 +14,6 @@ from db_con.connection import (
) )
def fetch_mapping_storevisibility( def fetch_mapping_storevisibility(
sql_engine: Engine, sql_engine: Engine,
table_name: str, table_name: str,
@@ -25,86 +22,117 @@ def fetch_mapping_storevisibility(
run_date: date run_date: date
) -> pl.DataFrame: ) -> pl.DataFrame:
run_date = run_date + timedelta(days=1)
client= get_clickhouse_client()
def table_exists(
client,
table_name: str,
) -> bool:
return bool(
client.command(
f"EXISTS TABLE {table_name}"
)
)
def get_reason_ids_mapping_storevisibility(
client,
run_date: date,
table_name: str = "mapping_storevisibility",
) -> list[int] :
if not table_exists(client, table_name):
log.warning(f"Table '{table_name}' does not exist. During collecting store_ids")
return [0]
query = f""" log.info(f"Fetching data from sql server for {table_type} table......")
SELECT DISTINCT StoreId
FROM mapping_storevisibility
WHERE toDate(Fromdate) <= toDate('{run_date + timedelta(days= 1)}')
AND toDate(Todate) >= toDate('{run_date + timedelta(days= 1)}')
AND project_Id = '40148'
"""
# ClickHouse -> PyArrow -> Polars
arrow_table = client.query_arrow(query)
df= pl.from_arrow(arrow_table)
list=df["reason_id"].to_list()
return list
def fetch_data(
engine: Engine,
table_name: str,
table_type: str,
run_date: date,
store_id: list[int]
) -> pl.DataFrame:
log.info(f"Fetching data from sql server for Master table......")
store_id_list = ",".join(str(sid) for sid in store_id)
sql_file = Path("src") / "sql" / f"bridge" / f"{table_name}.sql" sql_file = Path("src") / "sql" / f"bridge" / f"{table_name}.sql"
with open(sql_file, "r", encoding="utf-8") as f: with open(sql_file, "r", encoding="utf-8") as f:
sql_template = f.read() sql_template = f.read()
sql = sql_template.format( sql = sql_template.format( )
store_id_list=store_id_list,
run_date=run_date.strftime("%Y-%m-%d")
)
log.info(f"Fetching in progress .... ") log.info(f"Fetching in progress .... ")
df = pl.read_database( df = pl.read_database(
query=sql, query=sql,
connection=engine connection=sql_engine
) )
log.info(f"Fetched {len(df):,} rows from SQL Server") log.info(f"Fetched {len(df):,} rows from SQL Server")
return df return df
store_id=get_reason_ids_mapping_storevisibility(client, run_date, "mapping_storevisibility")
df=fetch_data(engine=sql_engine,
table_name=table_name,
table_type=table_type,
run_date=run_date,
store_id=store_id,
)
log.info(f"Fetched {len(df):,} rows from SQL Server")
return df # def fetch_mapping_storevisibility(
# sql_engine: Engine,
# table_name: str,
# table_type: str,
# mids: list[int],
# run_date: date
# ) -> pl.DataFrame:
# run_date = run_date + timedelta(days=1)
# client= get_clickhouse_client()
# def table_exists(
# client,
# table_name: str,
# ) -> bool:
# return bool(
# client.command(
# f"EXISTS TABLE {table_name}"
# )
# )
# def get_reason_ids_mapping_storevisibility(
# client,
# run_date: date,
# table_name: str = "mapping_storevisibility",
# ) -> list[int] :
# if not table_exists(client, table_name):
# log.warning(f"Table '{table_name}' does not exist. During collecting store_ids")
# return [0]
# query = f"""
# SELECT DISTINCT StoreId
# FROM mapping_storevisibility
# WHERE toDate(Fromdate) <= toDate('{run_date + timedelta(days= 1)}')
# AND toDate(Todate) >= toDate('{run_date + timedelta(days= 1)}')
# AND project_Id = '40148'
# """
# # ClickHouse -> PyArrow -> Polars
# arrow_table = client.query_arrow(query)
# df= pl.from_arrow(arrow_table)
# list=df["reason_id"].to_list()
# return list
# def fetch_data(
# engine: Engine,
# table_name: str,
# table_type: str,
# run_date: date,
# store_id: list[int]
# ) -> pl.DataFrame:
# log.info(f"Fetching data from sql server for Master table......")
# store_id_list = ",".join(str(sid) for sid in store_id)
# sql_file = Path("src") / "sql" / f"bridge" / f"{table_name}.sql"
# with open(sql_file, "r", encoding="utf-8") as f:
# sql_template = f.read()
# sql = sql_template.format(
# store_id_list=store_id_list,
# run_date=run_date.strftime("%Y-%m-%d")
# )
# log.info(f"Fetching in progress .... ")
# df = pl.read_database(
# query=sql,
# connection=engine
# )
# log.info(f"Fetched {len(df):,} rows from SQL Server")
# return df
# store_id=get_reason_ids_mapping_storevisibility(client, run_date, "mapping_storevisibility")
# df=fetch_data(engine=sql_engine,
# table_name=table_name,
# table_type=table_type,
# run_date=run_date,
# store_id=store_id,
# )
# log.info(f"Fetched {len(df):,} rows from SQL Server")
# return df
+263 -93
View File
@@ -134,6 +134,7 @@ def fetch_additional_visibility( engine: Engine,
return df return df
def fetch_OQaD( def fetch_OQaD(
sql_engine: Engine, sql_engine: Engine,
table_name: str, table_name: str,
@@ -142,83 +143,85 @@ def fetch_OQaD(
run_date: date run_date: date
) -> pl.DataFrame: ) -> pl.DataFrame:
# ─────────────────────────────────────────────
# INNER HELPERS (defined once, used below)
# ─────────────────────────────────────────────
client= get_clickhouse_client() client = get_clickhouse_client()
def table_exists(
client,
table_name: str,
) -> bool:
return bool( # ── Does a ClickHouse table exist? ────────────
client.command( def table_exists(client, table_name: str) -> bool:
f"EXISTS TABLE {table_name}"
) return bool(client.command(f"EXISTS TABLE {table_name}"))
)
# ── STEP 1: Who submitted yesterday in SQL Server? ───
def fetch_quiz_empids(engine: Engine, run_date: date) -> pl.DataFrame:
# Format date ONCE safely — avoids f-string injection bugs
run_date_str = run_date.strftime("%Y-%m-%d")
next_date_str = (run_date + timedelta(days=1)).strftime("%Y-%m-%d")
def fetch_quiz_empids(engine: Engine, run_date : date) -> pl.DataFrame: sql = f"""
sql_template = f"""
WITH MID_TABLE_COV1 AS WITH MID_TABLE_COV1 AS
( (
SELECT EmpId, VisitDate -- Records CREATED yesterday
SELECT EmpId, CAST(VisitDate AS DATE) AS VisitDate
FROM OneApp_KelloggsMT.dbo.T_OQAD FROM OneApp_KelloggsMT.dbo.T_OQAD
WHERE CreateDate >= {run_date} WHERE CreateDate >= '{run_date_str}'
AND CreateDate < DATEADD(DAY,1,'{run_date}') AND CreateDate < '{next_date_str}'
UNION ALL UNION ALL
SELECT EmpId, VisitDate -- Records UPDATED yesterday (different rows, safe to UNION ALL)
SELECT EmpId, CAST(VisitDate AS DATE) AS VisitDate
FROM OneApp_KelloggsMT.dbo.T_OQAD FROM OneApp_KelloggsMT.dbo.T_OQAD
WHERE UpdateDate >= {run_date} WHERE UpdateDate >= '{run_date_str}'
AND UpdateDate < DATEADD(DAY,1, '{run_date}') AND UpdateDate < '{next_date_str}'
), ),
QUIZ AS QUIZ AS
( (
SELECT Distinct E.EmpId as empid SELECT DISTINCT
, CONVERT(date,DQ.VisitDate) AS visitdate E.EmpId AS empid,
FROM OneApp_KelloggsMT.dbo.T_OQAD DQ INNER JOIN CAST(DQ.VisitDate AS DATE) AS visitdate
OneApp_KelloggsMT.dbo.vw_Employee_Detail E ON DQ.EmpId = E.EmpId inner join FROM OneApp_KelloggsMT.dbo.T_OQAD DQ
OneApp_KelloggsMT.dbo.Master_OQAD_Question QU on DQ.QuestionId= qu.QuestionId inner join INNER JOIN OneApp_KelloggsMT.dbo.vw_Employee_Detail E
OneApp_KelloggsMT.dbo.Master_OQAD_Category qc on qu.QuestionCategoryId= qc.QuestionCategoryId ON DQ.EmpId = E.EmpId
where e.EmpName not like 'test%' and e.RightId in (6) INNER JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Question QU
and (E.ResignDate is null or E.ResignDate>=''+CONVERT(VARCHAR,'{run_date}')+'') AND E.EmpName NOT LIKE '%TEST%' ON DQ.QuestionId = QU.QuestionId
AND DQ.EmpId IN (SELECT EmpId FROM MID_TABLE_COV1 A WHERE INNER JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Category QC
DQ.EmpId=A.EmpId AND CONVERT(date,VisitDate)=CONVERT(date,A.VisitDate) ) ON QU.QuestionCategoryId = QC.QuestionCategoryId
) select * from quiz WHERE E.EmpName NOT LIKE '%TEST%' -- exclude test employees
AND E.RightId = 6 -- only field reps
AND (
E.ResignDate IS NULL
OR CAST(E.ResignDate AS DATE) >= '{run_date_str}'
)
AND EXISTS ( -- ✅ EXISTS beats IN for large sets
SELECT 1
FROM MID_TABLE_COV1 A
WHERE A.EmpId = DQ.EmpId
AND A.VisitDate = CAST(DQ.VisitDate AS DATE)
)
)
SELECT * FROM QUIZ
""" """
sql = sql_template.format(
run_date=run_date.strftime("%Y-%m-%d")
)
log.info(f"Fetching quiz_empids data for EMPID and Visitid")
df = pl.read_database(
query=sql,
connection=engine
)
log.info(f"Fetched {len(df):,} total empid and visitdate fetched for OQAD from SQL Server")
log.info("Fetching quiz empids for run_date=%s", run_date_str)
df = pl.read_database(query=sql, connection=engine)
log.info("Fetched %s (EmpId, VisitDate) pairs from SQL Server", len(df))
return df return df
# ── STEP 2: Who do we ALREADY have in ClickHouse? ───
def get_empids_clickhouse_OQAD( def get_empids_clickhouse_OQAD(
client, client,
table_name: str = "OQaD", table_name: str = "OQaD",
) -> pl.DataFrame: ) -> pl.DataFrame:
if not table_exists(client, table_name): if not table_exists(client, table_name):
log.warning(f"Table '{table_name}' does not exist.") log.warning("Table '%s' does not exist in ClickHouse.", table_name)
return pl.DataFrame( return pl.DataFrame(schema={"empid": pl.Int64, "visitdate": pl.Date})
schema={
"empid": pl.Int64,
"visitdate": pl.Date,
}
)
query = f""" query = f"""
SELECT DISTINCT SELECT DISTINCT
@@ -227,82 +230,249 @@ def fetch_OQaD(
FROM {table_name} FROM {table_name}
""" """
# ClickHouse -> PyArrow -> Polars
arrow_table = client.query_arrow(query) arrow_table = client.query_arrow(query)
df = pl.from_arrow(arrow_table)
log.info("Fetched %s existing (EmpId, VisitDate) pairs from ClickHouse", len(df))
return df
return pl.from_arrow(arrow_table) # ── STEP 3: Who is NEW? (in SQL Server but NOT yet in ClickHouse) ───
def find_new_empids(
sql_df: pl.DataFrame,
ch_df: pl.DataFrame,
) -> list[int]:
new_df = sql_df.join(
qf=fetch_quiz_empids(sql_engine,run_date) ch_df,
db_df = get_empids_clickhouse_OQAD(client)
matched = qf.join(
db_df,
on=["empid", "visitdate"], on=["empid", "visitdate"],
how="inner", how="anti", # ✅ anti = keep rows NOT found in ch_df
) )
if matched.is_empty(): if new_df.is_empty():
log.warning("No new EmpIds found for table=%s — nothing to fetch.", table_name)
return [0] # sentinel value — the .sql WHERE will return 0 rows safely
empids=[0] empids = new_df["empid"].unique().to_list()
log.warning( log.info("Found %s NEW empids to fetch for %s", len(empids), table_name)
"%s Matched df in OQaD returned no rows", return empids
table_name,
)
else:
empids=matched["empid"].to_list()
log.info(f"Fetched {len(empids):,} matched empids fetched for OQAD ")
# ── STEP 4: Fetch full quiz data for new empids ───
def fetch_data( def fetch_data(
engine: Engine, engine: Engine,
table_name: str, table_name: str,
table_type: str, table_type: str,
empids: list[int], empids: list[int],
run_date: date run_date: date,
) -> pl.DataFrame: ) -> pl.DataFrame:
empid_list = ",".join(str(empid) for empid in empids)
run_date_str = run_date.strftime("%Y-%m-%d")
empid_list = ", ".join(str(e) for e in empids) # "101, 102, 103"
sql_file = Path("src") / "sql" / "fact" / f"{table_name}.sql" sql_file = Path("src") / "sql" / "fact" / f"{table_name}.sql"
log.info("Loading SQL from: %s (exists=%s)", sql_file.resolve(), sql_file.exists())
log.info(f"Exists: {sql_file.exists()}")
log.info(f"Path: {sql_file.resolve()}")
with open(sql_file, "r", encoding="utf-8") as f: with open(sql_file, "r", encoding="utf-8") as f:
sql_template = f.read() sql_template = f.read()
sql = sql_template.format( sql = sql_template.format(
empid_list=empid_list, empid_list=empid_list,
run_date=run_date.strftime("%Y-%m-%d") run_date=run_date_str,
) )
log.info(f"Fetching data for {len(empids):,} EMPIDs") log.info("Fetching full OQaD data for %s empids, run_date=%s", len(empids), run_date_str)
df = pl.read_database(query=sql, connection=engine)
log.info("Fetching OQaD data for run_date=%s", run_date) log.info("Fetched %s rows from SQL Server for table=%s", len(df), table_name)
df = pl.read_database(
query=sql,
connection=engine,
)
log.info("fn name is fetch_OQad ------Fetched %s rows", len(df))
return df return df
df=fetch_data( engine=sql_engine,
# ─────────────────────────────────────────────
# MAIN FLOW (the 4 steps, clearly sequenced)
# ─────────────────────────────────────────────
qf = fetch_quiz_empids(sql_engine, run_date) # Step 1
db_df = get_empids_clickhouse_OQAD(client, table_name) # Step 2
empids = find_new_empids(qf, db_df) # Step 3
df = fetch_data( # Step 4
engine=sql_engine,
table_name=table_name, table_name=table_name,
table_type=table_type, table_type=table_type,
empids=empids, empids=empids,
run_date=run_date run_date=run_date,
) )
log.info(f"Fetched {len(df):,} rows from SQL Server")
log.info("fetch_OQaD complete — returning %s rows", len(df))
return df return df
# def fetch_OQaD(
# sql_engine: Engine,
# table_name: str,
# table_type: str,
# mids: list[int],
# run_date: date
# ) -> pl.DataFrame:
# client= get_clickhouse_client()
# def table_exists(
# client,
# table_name: str,
# ) -> bool:
# return bool(
# client.command(
# f"EXISTS TABLE {table_name}"
# )
# )
# def fetch_quiz_empids(engine: Engine, run_date : date) -> pl.DataFrame:
# sql_template = f"""
# WITH MID_TABLE_COV1 AS
# (
# SELECT EmpId, VisitDate
# FROM OneApp_KelloggsMT.dbo.T_OQAD
# WHERE CreateDate >= {run_date}
# AND CreateDate < DATEADD(DAY,1,'{run_date}')
# UNION
# SELECT EmpId, VisitDate
# FROM OneApp_KelloggsMT.dbo.T_OQAD
# WHERE UpdateDate >= {run_date}
# AND UpdateDate < DATEADD(DAY,1, '{run_date}')
# ),
# QUIZ AS
# (
# SELECT Distinct E.EmpId as empid
# , CONVERT(date,DQ.VisitDate) AS visitdate
# FROM OneApp_KelloggsMT.dbo.T_OQAD DQ INNER JOIN
# OneApp_KelloggsMT.dbo.vw_Employee_Detail E ON DQ.EmpId = E.EmpId inner join
# OneApp_KelloggsMT.dbo.Master_OQAD_Question QU on DQ.QuestionId= qu.QuestionId inner join
# OneApp_KelloggsMT.dbo.Master_OQAD_Category qc on qu.QuestionCategoryId= qc.QuestionCategoryId
# where e.EmpName not like 'test%' and e.RightId in (6)
# and (E.ResignDate is null or E.ResignDate>=''+CONVERT(VARCHAR,'{run_date}')+'') AND E.EmpName NOT LIKE '%TEST%'
# AND DQ.EmpId IN (SELECT EmpId FROM MID_TABLE_COV1 A WHERE
# DQ.EmpId=A.EmpId AND CONVERT(date,VisitDate)=CONVERT(date,A.VisitDate) )
# ) select * from quiz
# """
# sql = sql_template.format(
# run_date=run_date.strftime("%Y-%m-%d")
# )
# log.info(f"Fetching quiz_empids data for EMPID and Visitid")
# df = pl.read_database(
# query=sql,
# connection=engine
# )
# log.info(f"Fetched {len(df):,} total empid and visitdate fetched for OQAD from SQL Server")
# return df
# def get_empids_clickhouse_OQAD(
# client,
# table_name: str = "OQaD",
# ) -> pl.DataFrame:
# if not table_exists(client, table_name):
# log.warning(f"Table '{table_name}' does not exist.")
# return pl.DataFrame(
# schema={
# "empid": pl.Int64,
# "visitdate": pl.Date,
# }
# )
# query = f"""
# SELECT DISTINCT
# employee_id AS empid,
# visit_date AS visitdate
# FROM {table_name}
# """
# # ClickHouse -> PyArrow -> Polars
# arrow_table = client.query_arrow(query)
# return pl.from_arrow(arrow_table)
# qf=fetch_quiz_empids(sql_engine,run_date)
# db_df = get_empids_clickhouse_OQAD(client)
# matched = qf.join(
# db_df,
# on=["empid", "visitdate"],
# how="inner",
# )
# if matched.is_empty():
# empids=[0]
# log.warning(
# "%s Matched df in OQaD returned no rows",
# table_name,
# )
# else:
# empids=matched["empid"].to_list()
# log.info(f"Fetched {len(empids):,} matched empids fetched for OQAD ")
# def fetch_data(
# engine: Engine,
# table_name: str,
# table_type: str,
# empids: list[int],
# run_date: date
# ) -> pl.DataFrame:
# empid_list = ",".join(str(empid) for empid in empids)
# sql_file = Path("src") / "sql" / "fact" / f"{table_name}.sql"
# log.info(f"Exists: {sql_file.exists()}")
# log.info(f"Path: {sql_file.resolve()}")
# with open(sql_file, "r", encoding="utf-8") as f:
# sql_template = f.read()
# sql = sql_template.format(
# empid_list=empid_list,
# run_date=run_date.strftime("%Y-%m-%d")
# )
# log.info(f"Fetching data for {len(empids):,} EMPIDs")
# log.info("Fetching OQaD data for run_date=%s", run_date)
# df = pl.read_database(
# query=sql,
# connection=engine,
# )
# log.info("fn name is fetch_OQad ------Fetched %s rows", len(df))
# return df
# df=fetch_data( engine=sql_engine,
# table_name=table_name,
# table_type=table_type,
# empids=empids,
# run_date=run_date
# )
# log.info(f"Fetched {len(df):,} rows from SQL Server")
# return df
# def fetch_OQaD( # def fetch_OQaD(
# engine: Engine, # engine: Engine,
+4 -5
View File
@@ -1,11 +1,10 @@
with mapping_storevisibility with mapping_storevisibility
(Project_Id,StoreId,VisibilityDefinitionid,Fromdate,Todate,CreateDate,CreateBy) (Project_Id,StoreId,VisibilityDefinitionid,Fromdate,Todate,CreateDate,CreateBy)
AS ( AS (
select DISTINCT '40148' as Project_Id,StoreId,VisibilityDefinitionid,Fromdate,Todate,getdate(),'SP-Pius' select DISTINCT '40148' as Project_Id,StoreId,VisibilityDefinitionid,Fromdate,Todate,getdate(),'SP-Pius'
FROM OneApp_KelloggsMT.dbo.mapping_storevisibility z WHERE FROM OneApp_KelloggsMT.dbo.mapping_storevisibility
convert(date,FROMDATE,101)<=convert(Date,getdate(),101) AND CONVERT(DATE,ToDate,101)>=convert(Date,getdate(),101)
AND z.VisibilityDefinitionid IN
(SELECT DISTINCT VisibilityDefinitionid FROM OneApp_KelloggsMT.dbo.MASTER_VISIBILITYDEFINITION WHERE MENUID=22 )
AND z.StoreId NOT IN ({store_id_list})
) )
select * from mapping_storevisibility select * from mapping_storevisibility
+14 -19
View File
@@ -1,19 +1,19 @@
WITH MID_TABLE_COV1 AS WITH MID_TABLE_COV1 AS
( (
SELECT EmpId, VisitDate
SELECT EmpId, CAST(VisitDate AS DATE) AS VisitDate
FROM OneApp_KelloggsMT.dbo.T_OQAD FROM OneApp_KelloggsMT.dbo.T_OQAD
WHERE CreateDate >= {run_date} WHERE CreateDate >= '{run_date}'
AND CreateDate < DATEADD(DAY,1,'{run_date}') AND CreateDate < DATEADD(DAY, 1, '{run_date}')
UNION ALL UNION ALL
SELECT EmpId, VisitDate SELECT EmpId, CAST(VisitDate AS DATE) AS VisitDate
FROM OneApp_KelloggsMT.dbo.T_OQAD FROM OneApp_KelloggsMT.dbo.T_OQAD
WHERE UpdateDate >= {run_date} WHERE UpdateDate >= '{run_date}'
AND UpdateDate < DATEADD(DAY,1, '{run_date}') AND UpdateDate < DATEADD(DAY, 1, '{run_date}')
), ),
QUIZ AS QUIZ AS
( (
SELECT DISTINCT SELECT DISTINCT
@@ -39,17 +39,15 @@ QUIZ AS
ON QU.QuestionCategoryId = QC.QuestionCategoryId ON QU.QuestionCategoryId = QC.QuestionCategoryId
WHERE E.EmpName NOT LIKE '%TEST%' WHERE E.EmpName NOT LIKE '%TEST%'
AND E.RightId = 6 AND E.RightId = 6
AND ( AND (E.ResignDate IS NULL OR CAST(E.ResignDate AS DATE) >= '{run_date}')
E.ResignDate IS NULL AND EXISTS (
OR CAST(E.ResignDate AS DATE) >= '{run_date}'
)
AND EXISTS
(
SELECT 1 SELECT 1
FROM MID_TABLE_COV1 A FROM MID_TABLE_COV1 A
WHERE A.EmpId = DQ.EmpId WHERE A.EmpId = DQ.EmpId
AND CAST(A.VisitDate AS DATE) = CAST(DQ.VisitDate AS DATE) AND A.VisitDate = CAST(DQ.VisitDate AS DATE)
) )
-- Keep only the EmpIds supplied by the caller. fetch_OQaD passes the EmpIds that are
-- present in SQL Server but NOT yet in ClickHouse (anti-join), or the sentinel 0 when
-- nothing is new, so IN (0) returns no rows. NOT IN here would exclude the new rows.
AND E.EmpId IN ({empid_list})
) )
SELECT SELECT
@@ -61,8 +59,8 @@ SELECT
Q.QuestionCategory AS question_category, Q.QuestionCategory AS question_category,
QM.QuestionId AS question_id, QM.QuestionId AS question_id,
QM.Question AS question, QM.Question AS question,
ISNULL(QA.AnswerId,0) AS answer_id, ISNULL(QA.AnswerId, 0) AS answer_id,
ISNULL(QA.Answer,'') AS answer, ISNULL(QA.Answer, '') AS answer,
CASE CASE
WHEN QA.AnswerId IS NULL THEN 'Not Answer' WHEN QA.AnswerId IS NULL THEN 'Not Answer'
WHEN QA.RightAnswer = 1 THEN 'Y' WHEN QA.RightAnswer = 1 THEN 'Y'
@@ -76,6 +74,3 @@ INNER JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Question QM
ON Q.QuestionId = QM.QuestionId ON Q.QuestionId = QM.QuestionId
LEFT JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Answer QA LEFT JOIN OneApp_KelloggsMT.dbo.Master_OQAD_Answer QA
ON Q.AnswerId = QA.AnswerId ON Q.AnswerId = QA.AnswerId
where Q.EmpId not in ({empid_list})
+2
View File
@@ -0,0 +1,2 @@
- pipeline_trigeered_on_date: '2026-06-23'
last_successful_run_date: 2026-06-22
+7 -6
View File
@@ -4,11 +4,6 @@ tables:
operation: INSERT operation: INSERT
fetch_by: mids fetch_by: mids
- name: OQaD
type: FACT
operation: INSERT
fetch_by: run_date
- name: Survey - name: Survey
type: FACT type: FACT
operation: INSERT operation: INSERT
@@ -87,7 +82,7 @@ tables:
- name: mapping_storevisibility - name: mapping_storevisibility
type: BRIDGE type: BRIDGE
operation: ONLY_INSERT operation: DELETE+INSERT
fetch_by: run_date fetch_by: run_date
- name: Master_VisibilityReason - name: Master_VisibilityReason
@@ -105,3 +100,9 @@ tables:
type: DIMENSION type: DIMENSION
operation: DELETE+INSERT operation: DELETE+INSERT
fetch_by: none fetch_by: none
- name: OQaD
type: FACT
operation: INSERT
fetch_by: run_date
+8 -8
View File
@@ -4,10 +4,10 @@ tables:
# operation: INSERT # operation: INSERT
# fetch_by: mids # fetch_by: mids
# # - name: OQaD - name: OQaD
# # type: FACT type: FACT
# # operation: INSERT operation: INSERT
# # fetch_by: run_date fetch_by: run_date
# - name: additional_visibility # - name: additional_visibility
# type: FACT # type: FACT
@@ -102,8 +102,8 @@ tables:
# operation: DELETE+INSERT # operation: DELETE+INSERT
# fetch_by: none # fetch_by: none
- name: Promotion # - name: Promotion
type: FACT # type: FACT
operation: INSERT # operation: INSERT
fetch_by: mids # fetch_by: mids