[Web Logins] Import

2026-05-19 16:58:54 +05:30
parent 379ead3d8e
commit 3be8cd7259
7 changed files with 11034 additions and 0 deletions
@@ -0,0 +1,545 @@
 import pyodbc
 import pandas as pd
 import clickhouse_connect
 import numpy as np
 from datetime import datetime
 import traceback
 import warnings
 # =========================================================
 # IGNORE WARNINGS
 # =========================================================
 # Silence only the pandas warning whose message starts with this text; the
 # script hands pd.read_sql a pyodbc connection, not a SQLAlchemy connectable.
 warnings.filterwarnings(
    'ignore',
    'pandas only supports SQLAlchemy connectable'
 )
 print("ETL Started :", datetime.now())
 # =========================================================
 # SQL SERVER CONNECTION
 # =========================================================
 # Every connection value can be overridden through an ETL_* environment
 # variable so credentials do not have to live in source control.
 # NOTE(review): the literal fallbacks are kept only so existing runs keep
 # working; they are committed secrets and should be rotated, then removed.
 # NOTE(review): an override containing ';' or '}' would need ODBC brace
 # quoting, which is not applied here.
 import os
 SQL_CONN_STR = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f"SERVER={os.environ.get('ETL_SQL_SERVER', '10.200.25.65')};"
    f"DATABASE={os.environ.get('ETL_SQL_DATABASE', 'CPMIndiaBusinessInsight')};"
    f"UID={os.environ.get('ETL_SQL_USER', 'bsgteam_test')};"
    f"PWD={os.environ.get('ETL_SQL_PASSWORD', 'B$gt3@m#00512')};"
    'TrustServerCertificate=yes;'
 )
 # =========================================================
 # CLICKHOUSE CONFIG
 # =========================================================
 # Passed as keyword arguments to clickhouse_connect.get_client().
 CH_CONFIG = {
    'host': os.environ.get('ETL_CH_HOST', '172.188.12.194'),
    'port': int(os.environ.get('ETL_CH_PORT', '8123')),
    'username': os.environ.get('ETL_CH_USER', 'default'),
    'password': os.environ.get('ETL_CH_PASSWORD', 'dipanshu_k'),
    'database': os.environ.get('ETL_CH_DATABASE', 'DaburIndia_BI')
 }
 # =========================================================
 # TABLE NAME
 # =========================================================
 # Used both as the SQL Server source table (dbo.[TABLE_NAME]) and as the
 # ClickHouse target table; PROJECT_ID filters the source rows.
 TABLE_NAME = 'OQaD'
 PROJECT_ID = 41654
 # =========================================================
 # LOAD SETTINGS
 # =========================================================
 # When True, the main loop truncates the ClickHouse table once, just before
 # the first insert; table_truncated records that this already happened.
 TRUNCATE_BEFORE_LOAD = True
 table_truncated = False
 # =========================================================
 # CLICKHOUSE DATE COLUMNS
 # =========================================================
 # Columns clean_dataframe converts to datetime.date (matched case-insensitively).
 DATE_COLUMNS = [
    'visit_date'
 ]
 # =========================================================
 # CLICKHOUSE DATETIME COLUMNS
 # =========================================================
 # Columns clean_dataframe converts to datetime.datetime (matched case-insensitively).
 DATETIME_COLUMNS = [
    'create_date',
    'update_date'
 ]
 # =========================================================
 # CLEAN DATAFRAME
 # =========================================================
 def clean_dataframe(df):
    try:
        # ---------------------------------------------
        # Replace NaN
        # ---------------------------------------------
        df = df.replace({np.nan: None})
        # ---------------------------------------------
        # Process Column Wise
        # ---------------------------------------------
        for col in df.columns:
            try:
                print(f"\nCleaning Column : {col}")
                # =====================================
                # DATE32 COLUMNS
                # =====================================
                if col.lower() in [
                    x.lower() for x in DATE_COLUMNS
                ]:
                    print(f"Date32 Column : {col}")
                    df[col] = pd.to_datetime(
                        df[col],
                        errors='coerce'
                    )
                    # Remove invalid dates
                    df[col] = df[col].where(
                        (df[col].dt.year >= 1970) &
                        (df[col].dt.year <= 2100)
                    )
                    # Convert to datetime.date
                    df[col] = df[col].apply(
                        lambda x:
                        x.date()
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # DATETIME64 COLUMNS
                # =====================================
                elif col.lower() in [
                    x.lower() for x in DATETIME_COLUMNS
                ]:
                    print(f"DateTime Column : {col}")
                    df[col] = pd.to_datetime(
                        df[col],
                        errors='coerce'
                    )
                    # Remove invalid dates
                    df[col] = df[col].where(
                        (df[col].dt.year >= 1970) &
                        (df[col].dt.year <= 2100)
                    )
                    # Convert to datetime.datetime
                    df[col] = df[col].apply(
                        lambda x:
                        x.to_pydatetime()
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # INTEGER COLUMNS
                # =====================================
                elif pd.api.types.is_integer_dtype(df[col]):
                    print(f"Integer Column : {col}")
                    df[col] = pd.to_numeric(
                        df[col],
                        errors='coerce'
                    )
                    df[col] = df[col].apply(
                        lambda x:
                        int(x)
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # FLOAT COLUMNS
                # =====================================
                elif pd.api.types.is_float_dtype(df[col]):
                    print(f"Float Column : {col}")
                    non_null = df[col].dropna()
                    # Convert whole float to int
                    if len(non_null) > 0 and (
                        (non_null % 1 == 0).all()
                    ):
                        df[col] = df[col].apply(
                            lambda x:
                            int(x)
                            if pd.notnull(x)
                            else None
                        )
                    else:
                        df[col] = df[col].apply(
                            lambda x:
                            float(x)
                            if pd.notnull(x)
                            else None
                        )
                # =====================================
                # OBJECT / STRING COLUMNS
                # =====================================
                else:
                    print(f"String/Object Column : {col}")
                    cleaned = []
                    for val in df[col]:
                        # NULL
                        if pd.isnull(val):
                            cleaned.append(None)
                        # INTEGER
                        elif isinstance(
                            val,
                            (
                                int,
                                np.integer
                            )
                        ):
                            cleaned.append(int(val))
                        # FLOAT
                        elif isinstance(
                            val,
                            (
                                float,
                                np.floating
                            )
                        ):
                            if np.isnan(val):
                                cleaned.append(None)
                            else:
                                if val.is_integer():
                                    cleaned.append(int(val))
                                else:
                                    cleaned.append(float(val))
                        # STRING
                        elif isinstance(val, str):
                            cleaned.append(val.strip())
                        # BOOLEAN
                        elif isinstance(val, bool):
                            cleaned.append(int(val))
                        # DATETIME
                        elif isinstance(
                            val,
                            (
                                datetime,
                                pd.Timestamp
                            )
                        ):
                            if isinstance(
                                val,
                                pd.Timestamp
                            ):
                                cleaned.append(
                                    val.to_pydatetime()
                                )
                            else:
                                cleaned.append(val)
                        # OTHER
                        else:
                            cleaned.append(str(val))
                    df[col] = cleaned
            except Exception as col_error:
                print("\n================================")
                print(f"COLUMN FAILED : {col}")
                print(str(col_error))
                print("================================")
        return df
    except Exception as clean_error:
        print("\n================================")
        print("DATA CLEAN FAILED")
        print(str(clean_error))
        print("================================")
        return df
 # =========================================================
 # MAIN PROCESS
 # =========================================================
 # Bind both handles up front so the cleanup in ``finally`` can distinguish
 # "never opened" from "opened" instead of relying on a swallowed NameError.
 sql_conn = None
 ch_client = None
 # Number of chunks whose clean/insert step raised; reported in the summary.
 failed_chunks = 0
 try:
    # =====================================================
    # CONNECT SQL SERVER
    # =====================================================
    sql_conn = pyodbc.connect(SQL_CONN_STR)
    print("Connected to SQL Server")
    # =====================================================
    # CONNECT CLICKHOUSE
    # =====================================================
    ch_client = clickhouse_connect.get_client(**CH_CONFIG)
    print("Connected to ClickHouse")
    # =====================================================
    # QUERY
    # =====================================================
    # TABLE_NAME and PROJECT_ID are constants defined in this script, not
    # external input, so they are interpolated directly.
    query = f"""
    SELECT *
    FROM dbo.[{TABLE_NAME}]
    WHERE Project_Id = {PROJECT_ID}
    """
    print("\nExecuting Query:")
    print(query)
    # =====================================================
    # CHUNK SIZE
    # =====================================================
    chunk_size = 100000
    total_rows = 0
    # =====================================================
    # READ DATA
    # =====================================================
    for chunk in pd.read_sql(
        query,
        sql_conn,
        chunksize=chunk_size
    ):
        try:
            print("\n================================")
            print(f"Processing {len(chunk)} Rows")
            print("================================")
            # =================================================
            # CLEAN DATA
            # =================================================
            chunk = clean_dataframe(chunk)
            # =================================================
            # DEBUG COLUMN TYPES
            # =================================================
            print("\nCOLUMN TYPES")
            for col in chunk.columns:
                sample = chunk[col].dropna()
                if len(sample) > 0:
                    print(
                        col,
                        type(sample.iloc[0]),
                        sample.iloc[0]
                    )
            # =================================================
            # DEBUG DATE COLUMN
            # =================================================
            if 'visit_date' in chunk.columns:
                print("\nvisit_date Sample")
                print(chunk['visit_date'].head())
                sample = chunk['visit_date'].dropna()
                if len(sample) > 0:
                    print(
                        "visit_date datatype:",
                        type(sample.iloc[0])
                    )
            # =================================================
            # TRUNCATE TABLE FIRST TIME ONLY
            # =================================================
            # Runs inside the loop, so a query that yields no chunks leaves
            # the existing ClickHouse rows untouched.
            if TRUNCATE_BEFORE_LOAD and not table_truncated:
                print("\n================================")
                print(f"TRUNCATING TABLE : {TABLE_NAME}")
                print("================================")
                truncate_query = f"""
                TRUNCATE TABLE
                {CH_CONFIG['database']}.{TABLE_NAME}
                """
                ch_client.command(truncate_query)
                print("TABLE TRUNCATED SUCCESSFULLY")
                table_truncated = True
            # =================================================
            # INSERT INTO CLICKHOUSE
            # =================================================
            print("\nInserting into ClickHouse...")
            ch_client.insert_df(
                table=TABLE_NAME,
                df=chunk,
                database=CH_CONFIG['database']
            )
            total_rows += len(chunk)
            print(
                f"\nInserted Total Rows : {total_rows}"
            )
        except Exception as chunk_error:
            # Best-effort load: record the failure and move on to the next
            # chunk, but count it so the final summary is not misleading.
            failed_chunks += 1
            print("\n================================")
            print("CHUNK INSERT FAILED")
            print("================================")
            print(str(chunk_error))
            traceback.print_exc()
            # =============================================
            # SAVE ERROR LOG
            # =============================================
            with open(
                "clickhouse_chunk_error.log",
                "a",
                encoding="utf-8"
            ) as log:
                log.write(
                    "\n\n================================"
                    f"\nTIME : {datetime.now()}"
                    f"\nTABLE : {TABLE_NAME}"
                    f"\nERROR : {chunk_error}"
                    f"\nTRACEBACK :\n{traceback.format_exc()}"
                    "\n================================"
                )
    print("\n================================")
    if failed_chunks:
        # The table may already have been truncated, so a partial load must
        # not be reported as a success.
        print(
            f"ETL COMPLETED WITH ERRORS : {failed_chunks} CHUNK(S) FAILED"
        )
    else:
        print("ETL COMPLETED SUCCESSFULLY")
    print(f"TOTAL ROWS INSERTED : {total_rows}")
    print("================================")
 # =========================================================
 # MAIN ERROR
 # =========================================================
 except Exception as main_error:
    print("\n================================")
    print("MAIN ERROR")
    print("================================")
    print(str(main_error))
    traceback.print_exc()
    with open(
        "clickhouse_main_error.log",
        "a",
        encoding="utf-8"
    ) as log:
        log.write(
            "\n\n================================"
            f"\nTIME : {datetime.now()}"
            f"\nERROR : {main_error}"
            f"\nTRACEBACK :\n{traceback.format_exc()}"
            "\n================================"
        )
 # =========================================================
 # CLOSE CONNECTIONS
 # =========================================================
 finally:
    # Close only what was actually opened; a failing close is reported
    # rather than silently discarded, and never masks the original error.
    if sql_conn is not None:
        try:
            sql_conn.close()
            print("\nSQL Server Connection Closed")
        except Exception as close_error:
            print(f"\nSQL Server Connection Close Failed : {close_error}")
    if ch_client is not None:
        try:
            ch_client.close()
            print("ClickHouse Connection Closed")
        except Exception as close_error:
            print(f"ClickHouse Connection Close Failed : {close_error}")
 print("\nETL Finished :", datetime.now())
@@ -0,0 +1,572 @@
 import pyodbc
 import pandas as pd
 import clickhouse_connect
 import numpy as np
 from datetime import datetime
 import traceback
 import warnings
 # =========================================================
 # IGNORE WARNINGS
 # =========================================================
 # Silence only the pandas warning whose message starts with this text; the
 # script hands pd.read_sql a pyodbc connection, not a SQLAlchemy connectable.
 warnings.filterwarnings(
    'ignore',
    'pandas only supports SQLAlchemy connectable'
 )
 print("ETL Started :", datetime.now())
 # =========================================================
 # SQL SERVER CONNECTION
 # =========================================================
 # Every connection value can be overridden through an ETL_* environment
 # variable so credentials do not have to live in source control.
 # NOTE(review): the literal fallbacks are kept only so existing runs keep
 # working; they are committed secrets and should be rotated, then removed.
 # NOTE(review): an override containing ';' or '}' would need ODBC brace
 # quoting, which is not applied here.
 import os
 SQL_CONN_STR = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f"SERVER={os.environ.get('ETL_SQL_SERVER', '10.200.25.65')};"
    f"DATABASE={os.environ.get('ETL_SQL_DATABASE', 'CPMIndiaBusinessInsight')};"
    f"UID={os.environ.get('ETL_SQL_USER', 'bsgteam_test')};"
    f"PWD={os.environ.get('ETL_SQL_PASSWORD', 'B$gt3@m#00512')};"
    'TrustServerCertificate=yes;'
 )
 # =========================================================
 # CLICKHOUSE CONFIG
 # =========================================================
 # Passed as keyword arguments to clickhouse_connect.get_client().
 CH_CONFIG = {
    'host': os.environ.get('ETL_CH_HOST', '172.188.12.194'),
    'port': int(os.environ.get('ETL_CH_PORT', '8123')),
    'username': os.environ.get('ETL_CH_USER', 'default'),
    'password': os.environ.get('ETL_CH_PASSWORD', 'dipanshu_k'),
    'database': os.environ.get('ETL_CH_DATABASE', 'DaburIndia_BI')
 }
 # =========================================================
 # TABLE NAME
 # =========================================================
 # Used both as the SQL Server source table (dbo.[TABLE_NAME]) and as the
 # ClickHouse target table; PROJECT_ID filters the source rows.
 TABLE_NAME = 'PaidVisibility_Compliance'
 PROJECT_ID = 41654
 # =========================================================
 # LOAD SETTINGS
 # =========================================================
 # When True, the main loop truncates the ClickHouse table once, just before
 # the first insert; table_truncated records that this already happened.
 TRUNCATE_BEFORE_LOAD = True
 table_truncated = False
 # =========================================================
 # CLICKHOUSE DATE COLUMNS
 # =========================================================
 # Columns clean_dataframe converts to datetime.date (matched case-insensitively).
 DATE_COLUMNS = [
    'visit_date'
 ]
 # =========================================================
 # CLICKHOUSE DATETIME COLUMNS
 # =========================================================
 # Columns clean_dataframe converts to datetime.datetime (matched case-insensitively).
 DATETIME_COLUMNS = [
    'create_date',
    'update_date'
 ]
 # =========================================================
 # CLEAN DATAFRAME
 # =========================================================
 def clean_dataframe(df):
    try:
        # ---------------------------------------------
        # Replace NaN
        # ---------------------------------------------
        df = df.replace({np.nan: None})
        # ---------------------------------------------
        # Process Columns
        # ---------------------------------------------
        for col in df.columns:
            try:
                print(f"\nCleaning Column : {col}")
                # =====================================
                # DATE32 COLUMNS
                # =====================================
                if col.lower() in [
                    x.lower() for x in DATE_COLUMNS
                ]:
                    print(f"Date32 Column : {col}")
                    df[col] = pd.to_datetime(
                        df[col],
                        errors='coerce'
                    )
                    cleaned_dates = []
                    for val in df[col]:
                        if pd.isnull(val):
                            cleaned_dates.append(None)
                        else:
                            # IMPORTANT FIX
                            cleaned_dates.append(
                                val.date()
                            )
                    df[col] = cleaned_dates
                # =====================================
                # DATETIME COLUMNS
                # =====================================
                elif col.lower() in [
                    x.lower() for x in DATETIME_COLUMNS
                ]:
                    print(f"DateTime Column : {col}")
                    df[col] = pd.to_datetime(
                        df[col],
                        errors='coerce'
                    )
                    cleaned_datetime = []
                    for val in df[col]:
                        if pd.isnull(val):
                            cleaned_datetime.append(None)
                        else:
                            # IMPORTANT FIX
                            cleaned_datetime.append(
                                val.to_pydatetime()
                            )
                    df[col] = cleaned_datetime
                # =====================================
                # INTEGER COLUMNS
                # =====================================
                elif pd.api.types.is_integer_dtype(df[col]):
                    print(f"Integer Column : {col}")
                    df[col] = pd.to_numeric(
                        df[col],
                        errors='coerce'
                    )
                    df[col] = df[col].apply(
                        lambda x:
                        int(x)
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # FLOAT COLUMNS
                # =====================================
                elif pd.api.types.is_float_dtype(df[col]):
                    print(f"Float Column : {col}")
                    df[col] = pd.to_numeric(
                        df[col],
                        errors='coerce'
                    )
                    non_null = df[col].dropna()
                    # Convert float to int if possible
                    if len(non_null) > 0 and (
                        (non_null % 1 == 0).all()
                    ):
                        df[col] = df[col].apply(
                            lambda x:
                            int(x)
                            if pd.notnull(x)
                            else None
                        )
                    else:
                        df[col] = df[col].apply(
                            lambda x:
                            float(x)
                            if pd.notnull(x)
                            else None
                        )
                # =====================================
                # OBJECT / STRING COLUMNS
                # =====================================
                else:
                    print(f"String/Object Column : {col}")
                    cleaned = []
                    for val in df[col]:
                        # NULL
                        if pd.isnull(val):
                            cleaned.append(None)
                        # STRING
                        elif isinstance(val, str):
                            cleaned.append(
                                val.strip()
                            )
                        # BOOLEAN
                        elif isinstance(val, bool):
                            cleaned.append(
                                int(val)
                            )
                        # INTEGER
                        elif isinstance(
                            val,
                            (
                                int,
                                np.integer
                            )
                        ):
                            cleaned.append(
                                int(val)
                            )
                        # FLOAT
                        elif isinstance(
                            val,
                            (
                                float,
                                np.floating
                            )
                        ):
                            if np.isnan(val):
                                cleaned.append(None)
                            else:
                                if val.is_integer():
                                    cleaned.append(
                                        int(val)
                                    )
                                else:
                                    cleaned.append(
                                        float(val)
                                    )
                        # DATETIME
                        elif isinstance(
                            val,
                            (
                                datetime,
                                pd.Timestamp
                            )
                        ):
                            if isinstance(
                                val,
                                pd.Timestamp
                            ):
                                cleaned.append(
                                    val.to_pydatetime()
                                )
                            else:
                                cleaned.append(val)
                        # DATE
                        elif hasattr(val, 'year') and hasattr(val, 'month'):
                            cleaned.append(val)
                        # OTHER
                        else:
                            cleaned.append(
                                str(val)
                            )
                    df[col] = cleaned
            except Exception as col_error:
                print("\n================================")
                print(f"COLUMN FAILED : {col}")
                print(str(col_error))
                print("================================")
        return df
    except Exception as clean_error:
        print("\n================================")
        print("DATA CLEAN FAILED")
        print(str(clean_error))
        print("================================")
        return df
 # =========================================================
 # MAIN PROCESS
 # =========================================================
 # Bind both handles up front so the cleanup in ``finally`` can distinguish
 # "never opened" from "opened" instead of relying on a swallowed NameError.
 sql_conn = None
 ch_client = None
 # Number of chunks whose clean/insert step raised; reported in the summary.
 failed_chunks = 0
 try:
    # =====================================================
    # CONNECT SQL SERVER
    # =====================================================
    sql_conn = pyodbc.connect(SQL_CONN_STR)
    print("Connected to SQL Server")
    # =====================================================
    # CONNECT CLICKHOUSE
    # =====================================================
    ch_client = clickhouse_connect.get_client(**CH_CONFIG)
    print("Connected to ClickHouse")
    # =====================================================
    # QUERY
    # =====================================================
    # TABLE_NAME and PROJECT_ID are constants defined in this script, not
    # external input, so they are interpolated directly.
    query = f"""
    SELECT *
    FROM dbo.[{TABLE_NAME}]
    WHERE Project_Id = {PROJECT_ID}
    """
    print("\nExecuting Query:")
    print(query)
    # =====================================================
    # CHUNK SIZE
    # =====================================================
    chunk_size = 100000
    total_rows = 0
    # =====================================================
    # READ DATA
    # =====================================================
    for chunk in pd.read_sql(
        query,
        sql_conn,
        chunksize=chunk_size
    ):
        try:
            print("\n================================")
            print(f"Processing {len(chunk)} Rows")
            print("================================")
            # =================================================
            # CLEAN DATA
            # =================================================
            chunk = clean_dataframe(chunk)
            # =================================================
            # DEBUG COLUMN TYPES
            # =================================================
            print("\nCOLUMN TYPES")
            for col in chunk.columns:
                sample = chunk[col].dropna()
                if len(sample) > 0:
                    print(
                        col,
                        type(sample.iloc[0]),
                        sample.iloc[0]
                    )
            # =================================================
            # DEBUG DATE COLUMN
            # =================================================
            if 'visit_date' in chunk.columns:
                print("\nvisit_date Sample")
                print(chunk['visit_date'].head())
                sample = chunk['visit_date'].dropna()
                if len(sample) > 0:
                    print(
                        "visit_date datatype:",
                        type(sample.iloc[0])
                    )
            # =================================================
            # TRUNCATE TABLE
            # =================================================
            # Runs inside the loop, so a query that yields no chunks leaves
            # the existing ClickHouse rows untouched.
            if TRUNCATE_BEFORE_LOAD and not table_truncated:
                print("\n================================")
                print(f"TRUNCATING TABLE : {TABLE_NAME}")
                print("================================")
                truncate_query = f"""
                TRUNCATE TABLE
                {CH_CONFIG['database']}.{TABLE_NAME}
                """
                ch_client.command(truncate_query)
                print("TABLE TRUNCATED SUCCESSFULLY")
                table_truncated = True
            # =================================================
            # INSERT INTO CLICKHOUSE
            # =================================================
            print("\nInserting into ClickHouse...")
            ch_client.insert_df(
                table=TABLE_NAME,
                df=chunk,
                database=CH_CONFIG['database']
            )
            total_rows += len(chunk)
            print(
                f"\nInserted Total Rows : {total_rows}"
            )
        except Exception as chunk_error:
            # Best-effort load: record the failure and move on to the next
            # chunk, but count it so the final summary is not misleading.
            failed_chunks += 1
            print("\n================================")
            print("CHUNK INSERT FAILED")
            print("================================")
            print(str(chunk_error))
            traceback.print_exc()
            # =============================================
            # SAVE ERROR LOG
            # =============================================
            with open(
                "clickhouse_chunk_error.log",
                "a",
                encoding="utf-8"
            ) as log:
                log.write(
                    "\n\n================================"
                    f"\nTIME : {datetime.now()}"
                    f"\nTABLE : {TABLE_NAME}"
                    f"\nERROR : {chunk_error}"
                    f"\nTRACEBACK :\n{traceback.format_exc()}"
                    "\n================================"
                )
    print("\n================================")
    if failed_chunks:
        # The table may already have been truncated, so a partial load must
        # not be reported as a success.
        print(
            f"ETL COMPLETED WITH ERRORS : {failed_chunks} CHUNK(S) FAILED"
        )
    else:
        print("ETL COMPLETED SUCCESSFULLY")
    print(f"TOTAL ROWS INSERTED : {total_rows}")
    print("================================")
 # =========================================================
 # MAIN ERROR
 # =========================================================
 except Exception as main_error:
    print("\n================================")
    print("MAIN ERROR")
    print("================================")
    print(str(main_error))
    traceback.print_exc()
    with open(
        "clickhouse_main_error.log",
        "a",
        encoding="utf-8"
    ) as log:
        log.write(
            "\n\n================================"
            f"\nTIME : {datetime.now()}"
            f"\nERROR : {main_error}"
            f"\nTRACEBACK :\n{traceback.format_exc()}"
            "\n================================"
        )
 # =========================================================
 # CLOSE CONNECTIONS
 # =========================================================
 finally:
    # Close only what was actually opened; a failing close is reported
    # rather than silently discarded, and never masks the original error.
    if sql_conn is not None:
        try:
            sql_conn.close()
            print("\nSQL Server Connection Closed")
        except Exception as close_error:
            print(f"\nSQL Server Connection Close Failed : {close_error}")
    if ch_client is not None:
        try:
            ch_client.close()
            print("ClickHouse Connection Closed")
        except Exception as close_error:
            print(f"ClickHouse Connection Close Failed : {close_error}")
 print("\nETL Finished :", datetime.now())
@@ -0,0 +1,545 @@
 import pyodbc
 import pandas as pd
 import clickhouse_connect
 import numpy as np
 from datetime import datetime
 import traceback
 import warnings
 import os
 # =========================================================
 # IGNORE WARNINGS
 # =========================================================
 # pandas warns when read_sql is given a raw DBAPI connection instead of a
 # SQLAlchemy connectable; that is exactly what this script does.
 warnings.filterwarnings(
    'ignore',
    'pandas only supports SQLAlchemy connectable'
 )
 print("ETL Started :", datetime.now())
 # =========================================================
 # SQL SERVER CONNECTION
 # =========================================================
 # SECURITY: the literal defaults below are credentials committed to source
 # control. Every connection value can be overridden through an ETL_*
 # environment variable; once deployments set them, delete the literal
 # fallbacks and rotate the exposed passwords.
 # NOTE: overrides are inserted verbatim, so they must not contain ';'.
 SQL_CONN_STR = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f"SERVER={os.environ.get('ETL_SQL_SERVER', '10.200.25.65')};"
    f"DATABASE={os.environ.get('ETL_SQL_DATABASE', 'CPMIndiaBusinessInsight')};"
    f"UID={os.environ.get('ETL_SQL_UID', 'bsgteam_test')};"
    f"PWD={os.environ.get('ETL_SQL_PWD', 'B$gt3@m#00512')};"
    'TrustServerCertificate=yes;'
 )
 # =========================================================
 # CLICKHOUSE CONFIG
 # =========================================================
 # Passed as keyword arguments to clickhouse_connect.get_client().
 CH_CONFIG = {
    'host': os.environ.get('ETL_CH_HOST', '172.188.12.194'),
    'port': int(os.environ.get('ETL_CH_PORT', '8123')),
    'username': os.environ.get('ETL_CH_USERNAME', 'default'),
    'password': os.environ.get('ETL_CH_PASSWORD', 'dipanshu_k'),
    'database': os.environ.get('ETL_CH_DATABASE', 'DaburIndia_BI')
 }
 # =========================================================
 # TABLE NAME
 # =========================================================
 # Same name is used for the SQL Server source and the ClickHouse target.
 TABLE_NAME = 'PaidVisibility'
 PROJECT_ID = 41654
 # =========================================================
 # LOAD SETTINGS
 # =========================================================
 # When True the ClickHouse target is emptied once, right before the first
 # chunk is inserted; table_truncated records that this already happened.
 TRUNCATE_BEFORE_LOAD = True
 table_truncated = False
 # =========================================================
 # CLICKHOUSE DATE COLUMNS
 # =========================================================
 # Column names (matched case-insensitively) converted to datetime.date.
 DATE_COLUMNS = [
    'visit_date'
 ]
 # =========================================================
 # CLICKHOUSE DATETIME COLUMNS
 # =========================================================
 # Column names (matched case-insensitively) converted to datetime.datetime.
 DATETIME_COLUMNS = [
    'create_date',
    'update_date'
 ]
 # =========================================================
 # CLEAN DATAFRAME
 # =========================================================
 def _coerce_datetime(series):
    """Parse *series* as datetimes, blanking anything unusable.

    Values that cannot be parsed, or whose year falls outside 1970..2100,
    come back as NaT.
    """
    parsed = pd.to_datetime(series, errors='coerce')
    return parsed.where(
        (parsed.dt.year >= 1970) &
        (parsed.dt.year <= 2100)
    )
 def _clean_object_value(val):
    """Normalise one cell of an object/string column to a plain Python value.

    Nulls become None, booleans become 0/1, whole floats become ints,
    strings are stripped, pandas Timestamps become datetime.datetime,
    date-like values are kept unchanged and anything else is stringified.
    """
    if pd.isnull(val):
        return None
    # bool is a subclass of int, so it must be tested before the integer
    # check; np.bool_ is neither bool nor int and used to fall through to
    # str(), ending up as the text 'True' / 'False'.
    if isinstance(val, (bool, np.bool_)):
        return int(val)
    if isinstance(val, (int, np.integer)):
        return int(val)
    if isinstance(val, (float, np.floating)):
        # NaN never reaches this point: pd.isnull caught it above.
        return int(val) if float(val).is_integer() else float(val)
    if isinstance(val, str):
        return val.strip()
    if isinstance(val, pd.Timestamp):
        return val.to_pydatetime()
    # datetime.datetime and datetime.date both expose .year. Plain dates
    # used to be stringified here; they are now kept, matching the
    # sibling loader scripts.
    if hasattr(val, 'year'):
        return val
    return str(val)
 def clean_dataframe(df):
    """Return a copy of *df* whose cells are plain Python/NumPy scalars.

    Columns named in DATE_COLUMNS / DATETIME_COLUMNS are parsed as dates or
    datetimes, integer and float columns are cast element-wise, and every
    other column is normalised cell by cell. A column that raises is
    reported on stdout and left as it was; the function never raises.
    """
    try:
        # ---------------------------------------------
        # Replace NaN
        # ---------------------------------------------
        df = df.replace({np.nan: None})
        # Lower-cased once here instead of once per column.
        date_cols = {name.lower() for name in DATE_COLUMNS}
        datetime_cols = {name.lower() for name in DATETIME_COLUMNS}
        # ---------------------------------------------
        # Process Column Wise
        # ---------------------------------------------
        for col in df.columns:
            try:
                print(f"\nCleaning Column : {col}")
                key = col.lower()
                # =====================================
                # DATE32 COLUMNS
                # =====================================
                if key in date_cols:
                    print(f"Date32 Column : {col}")
                    df[col] = _coerce_datetime(df[col])
                    # Convert to datetime.date
                    df[col] = df[col].apply(
                        lambda x:
                        x.date()
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # DATETIME64 COLUMNS
                # =====================================
                elif key in datetime_cols:
                    print(f"DateTime Column : {col}")
                    df[col] = _coerce_datetime(df[col])
                    # Convert to datetime.datetime
                    df[col] = df[col].apply(
                        lambda x:
                        x.to_pydatetime()
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # INTEGER COLUMNS
                # =====================================
                elif pd.api.types.is_integer_dtype(df[col]):
                    print(f"Integer Column : {col}")
                    df[col] = pd.to_numeric(
                        df[col],
                        errors='coerce'
                    )
                    df[col] = df[col].apply(
                        lambda x:
                        int(x)
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # FLOAT COLUMNS
                # =====================================
                elif pd.api.types.is_float_dtype(df[col]):
                    print(f"Float Column : {col}")
                    non_null = df[col].dropna()
                    # A column holding only whole numbers (12.0, 7.0, ...)
                    # is emitted as ints; otherwise values stay floats.
                    all_whole = len(non_null) > 0 and (
                        (non_null % 1 == 0).all()
                    )
                    cast = int if all_whole else float
                    df[col] = df[col].apply(
                        lambda x:
                        cast(x)
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # OBJECT / STRING COLUMNS
                # =====================================
                else:
                    print(f"String/Object Column : {col}")
                    df[col] = [
                        _clean_object_value(val)
                        for val in df[col]
                    ]
            except Exception as col_error:
                # Best effort: report the column and carry on with the rest.
                print("\n================================")
                print(f"COLUMN FAILED : {col}")
                print(str(col_error))
                print("================================")
        return df
    except Exception as clean_error:
        print("\n================================")
        print("DATA CLEAN FAILED")
        print(str(clean_error))
        print("================================")
        return df
 # =========================================================
 # MAIN PROCESS
 # =========================================================
 # Streams dbo.[TABLE_NAME] for PROJECT_ID out of SQL Server in chunks,
 # cleans each chunk with clean_dataframe() and inserts it into the
 # ClickHouse table of the same name. The target is truncated once,
 # immediately before the first insert.
 # Both handles start as None so the cleanup in ``finally`` only closes
 # connections that were actually opened.
 sql_conn = None
 ch_client = None
 try:
    # =====================================================
    # CONNECT SQL SERVER
    # =====================================================
    sql_conn = pyodbc.connect(SQL_CONN_STR)
    print("Connected to SQL Server")
    # =====================================================
    # CONNECT CLICKHOUSE
    # =====================================================
    ch_client = clickhouse_connect.get_client(**CH_CONFIG)
    print("Connected to ClickHouse")
    # =====================================================
    # QUERY
    # =====================================================
    query = f"""
    SELECT *
    FROM dbo.[{TABLE_NAME}]
    WHERE Project_Id = {PROJECT_ID}
    """
    print("\nExecuting Query:")
    print(query)
    # =====================================================
    # CHUNK SIZE
    # =====================================================
    chunk_size = 100000
    total_rows = 0
    # Chunks that raised during clean/truncate/insert. They are skipped, so
    # the closing banner must not claim success when this is non-zero.
    failed_chunks = 0
    # =====================================================
    # READ DATA
    # =====================================================
    for chunk in pd.read_sql(
        query,
        sql_conn,
        chunksize=chunk_size
    ):
        try:
            print("\n================================")
            print(f"Processing {len(chunk)} Rows")
            print("================================")
            # =================================================
            # CLEAN DATA
            # =================================================
            chunk = clean_dataframe(chunk)
            # =================================================
            # DEBUG COLUMN TYPES
            # =================================================
            print("\nCOLUMN TYPES")
            for col in chunk.columns:
                sample = chunk[col].dropna()
                if len(sample) > 0:
                    print(
                        col,
                        type(sample.iloc[0]),
                        sample.iloc[0]
                    )
            # =================================================
            # DEBUG DATE COLUMN
            # =================================================
            if 'visit_date' in chunk.columns:
                print("\nvisit_date Sample")
                print(chunk['visit_date'].head())
                sample = chunk['visit_date'].dropna()
                if len(sample) > 0:
                    print(
                        "visit_date datatype:",
                        type(sample.iloc[0])
                    )
            # =================================================
            # TRUNCATE TABLE FIRST TIME ONLY
            # =================================================
            # Done lazily inside the loop, so an empty source result
            # leaves the existing target data in place.
            if TRUNCATE_BEFORE_LOAD and not table_truncated:
                print("\n================================")
                print(f"TRUNCATING TABLE : {TABLE_NAME}")
                print("================================")
                truncate_query = f"""
                TRUNCATE TABLE
                {CH_CONFIG['database']}.{TABLE_NAME}
                """
                ch_client.command(truncate_query)
                print("TABLE TRUNCATED SUCCESSFULLY")
                table_truncated = True
            # =================================================
            # INSERT INTO CLICKHOUSE
            # =================================================
            print("\nInserting into ClickHouse...")
            ch_client.insert_df(
                table=TABLE_NAME,
                df=chunk,
                database=CH_CONFIG['database']
            )
            total_rows += len(chunk)
            print(
                f"\nInserted Total Rows : {total_rows}"
            )
        except Exception as chunk_error:
            failed_chunks += 1
            print("\n================================")
            print("CHUNK INSERT FAILED")
            print("================================")
            print(str(chunk_error))
            traceback.print_exc()
            # =============================================
            # SAVE ERROR LOG
            # =============================================
            with open(
                "clickhouse_chunk_error.log",
                "a",
                encoding="utf-8"
            ) as log:
                log.write(
                    "\n\n================================"
                    f"\nTIME : {datetime.now()}"
                    f"\nTABLE : {TABLE_NAME}"
                    f"\nERROR : {str(chunk_error)}"
                    f"\nTRACEBACK :\n"
                    f"{traceback.format_exc()}"
                    "\n================================"
                )
    print("\n================================")
    if failed_chunks:
        # Skipped chunks mean the target is missing rows.
        print("ETL COMPLETED WITH ERRORS")
        print(f"FAILED CHUNKS : {failed_chunks}")
    else:
        print("ETL COMPLETED SUCCESSFULLY")
    print(f"TOTAL ROWS INSERTED : {total_rows}")
    print("================================")
 # =========================================================
 # MAIN ERROR
 # =========================================================
 except Exception as main_error:
    print("\n================================")
    print("MAIN ERROR")
    print("================================")
    print(str(main_error))
    traceback.print_exc()
    with open(
        "clickhouse_main_error.log",
        "a",
        encoding="utf-8"
    ) as log:
        log.write(
            "\n\n================================"
            f"\nTIME : {datetime.now()}"
            f"\nERROR : {str(main_error)}"
            f"\nTRACEBACK :\n"
            f"{traceback.format_exc()}"
            "\n================================"
        )
 # =========================================================
 # CLOSE CONNECTIONS
 # =========================================================
 finally:
    # Closing stays best effort, but failures are reported instead of
    # being swallowed by a bare ``except``.
    if sql_conn is not None:
        try:
            sql_conn.close()
            print("\nSQL Server Connection Closed")
        except Exception as close_error:
            print(f"\nSQL Server Connection Close Failed : {close_error}")
    if ch_client is not None:
        try:
            ch_client.close()
            print("ClickHouse Connection Closed")
        except Exception as close_error:
            print(f"ClickHouse Connection Close Failed : {close_error}")
 print("\nETL Finished :", datetime.now())
@@ -0,0 +1,632 @@
 import pyodbc
 import pandas as pd
 import clickhouse_connect
 import numpy as np
 from datetime import datetime
 import traceback
 import warnings
 import os
 # =========================================================
 # IGNORE WARNINGS
 # =========================================================
 # pandas warns when read_sql is given a raw DBAPI connection instead of a
 # SQLAlchemy connectable; that is exactly what this script does.
 warnings.filterwarnings(
    'ignore',
    'pandas only supports SQLAlchemy connectable'
 )
 print("\n====================================")
 print("ETL Started :", datetime.now())
 print("====================================")
 # =========================================================
 # SQL SERVER CONNECTION
 # =========================================================
 # SECURITY: the literal defaults below are credentials committed to source
 # control. Every connection value can be overridden through an ETL_*
 # environment variable; once deployments set them, delete the literal
 # fallbacks and rotate the exposed passwords.
 # NOTE: overrides are inserted verbatim, so they must not contain ';'.
 SQL_CONN_STR = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f"SERVER={os.environ.get('ETL_SQL_SERVER', '10.200.25.65')};"
    f"DATABASE={os.environ.get('ETL_SQL_DATABASE', 'CPMIndiaBusinessInsight')};"
    f"UID={os.environ.get('ETL_SQL_UID', 'bsgteam_test')};"
    f"PWD={os.environ.get('ETL_SQL_PWD', 'B$gt3@m#00512')};"
    'TrustServerCertificate=yes;'
 )
 # =========================================================
 # CLICKHOUSE CONFIG
 # =========================================================
 # Passed as keyword arguments to clickhouse_connect.get_client().
 CH_CONFIG = {
    'host': os.environ.get('ETL_CH_HOST', '172.188.12.194'),
    'port': int(os.environ.get('ETL_CH_PORT', '8123')),
    'username': os.environ.get('ETL_CH_USERNAME', 'default'),
    'password': os.environ.get('ETL_CH_PASSWORD', 'dipanshu_k'),
    'database': os.environ.get('ETL_CH_DATABASE', 'DaburIndia_BI')
 }
 # =========================================================
 # TABLE DETAILS
 # =========================================================
 # Same name is used for the SQL Server source and the ClickHouse target.
 # It contains a space, so it must be quoted wherever it is used in SQL.
 TABLE_NAME = 'SKU Master'
 PROJECT_ID = 41654
 # =========================================================
 # SETTINGS
 # =========================================================
 # When True the ClickHouse target is emptied once, right before the first
 # chunk is inserted; table_truncated records that this already happened.
 TRUNCATE_BEFORE_LOAD = True
 table_truncated = False
 # =========================================================
 # CLICKHOUSE DATE COLUMNS
 # =========================================================
 # Column names (matched case-insensitively) converted to datetime.date.
 DATE_COLUMNS = [
    'visit_date'
 ]
 # =========================================================
 # CLICKHOUSE DATETIME COLUMNS
 # =========================================================
 # Column names (matched case-insensitively) converted to datetime.datetime.
 DATETIME_COLUMNS = [
    'create_date',
    'update_date'
 ]
 # =========================================================
 # CLEAN DATAFRAME
 # =========================================================
 def _normalise_cell(val):
    """Map one cell of an object/string column to a plain Python value.

    Nulls become None, strings are stripped, booleans become 0/1, whole
    floats become ints, pandas Timestamps become datetime.datetime,
    date-like values are returned unchanged and anything else is
    stringified.
    """
    if pd.isnull(val):
        return None
    if isinstance(val, str):
        return val.strip()
    # bool is tested ahead of int because bool is a subclass of int.
    if isinstance(val, bool):
        return int(val)
    if isinstance(val, (int, np.integer)):
        return int(val)
    if isinstance(val, (float, np.floating)):
        if np.isnan(val):
            return None
        return int(val) if val.is_integer() else float(val)
    if isinstance(val, pd.Timestamp):
        return val.to_pydatetime()
    if isinstance(val, datetime):
        return val
    # Anything else exposing .year is treated as a date and kept.
    if hasattr(val, 'year'):
        return val
    return str(val)
 def clean_dataframe(df):
    """Return a copy of *df* whose cells are plain Python/NumPy scalars.

    Columns named in DATE_COLUMNS / DATETIME_COLUMNS are parsed as dates or
    datetimes, integer and float columns are cast element-wise, and every
    other column is normalised cell by cell. A cell that raises becomes
    None; a column that raises is reported on stdout and left as it was.
    The function itself never raises.
    """
    try:
        # NaN -> None up front
        df = df.replace({np.nan: None})
        date_names = {name.lower() for name in DATE_COLUMNS}
        datetime_names = {name.lower() for name in DATETIME_COLUMNS}
        for col in df.columns:
            try:
                print(f"\nCleaning Column : {col}")
                lowered = col.lower()
                if lowered in date_names:
                    # ----- date columns -----
                    print(f"Date Column : {col}")
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                    df[col] = [
                        None if pd.isnull(stamp) else stamp.date()
                        for stamp in df[col]
                    ]
                elif lowered in datetime_names:
                    # ----- datetime columns -----
                    print(f"DateTime Column : {col}")
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                    df[col] = [
                        None if pd.isnull(stamp) else stamp.to_pydatetime()
                        for stamp in df[col]
                    ]
                elif pd.api.types.is_integer_dtype(df[col]):
                    # ----- integer columns -----
                    print(f"Integer Column : {col}")
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                    df[col] = df[col].apply(
                        lambda x: None if pd.isnull(x) else int(x)
                    )
                elif pd.api.types.is_float_dtype(df[col]):
                    # ----- float columns -----
                    print(f"Float Column : {col}")
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                    present = df[col].dropna()
                    # A column of whole numbers only (12.0 -> 12) is
                    # emitted as ints; otherwise values stay floats.
                    if len(present) > 0 and (present % 1 == 0).all():
                        caster = int
                    else:
                        caster = float
                    df[col] = df[col].apply(
                        lambda x: None if pd.isnull(x) else caster(x)
                    )
                else:
                    # ----- object / string columns -----
                    print(f"String/Object Column : {col}")
                    cleaned = []
                    for val in df[col]:
                        try:
                            cleaned.append(_normalise_cell(val))
                        except Exception as row_error:
                            print(f"Row Cleaning Error in Column {col}")
                            print(str(row_error))
                            cleaned.append(None)
                    df[col] = cleaned
            except Exception as col_error:
                print("\n================================")
                print(f"COLUMN FAILED : {col}")
                print(str(col_error))
                print("================================")
        return df
    except Exception as clean_error:
        print("\n================================")
        print("DATA CLEAN FAILED")
        print(str(clean_error))
        print(traceback.format_exc())
        print("================================")
        return df
 # =========================================================
 # MAIN PROCESS
 # =========================================================
 # Streams dbo.[TABLE_NAME] for PROJECT_ID out of SQL Server in chunks,
 # cleans each chunk with clean_dataframe() and inserts it into the
 # ClickHouse table of the same name. The target is truncated once,
 # immediately before the first insert.
 # Both handles start as None so the cleanup in ``finally`` only closes
 # connections that were actually opened.
 sql_conn = None
 ch_client = None
 try:
    # =====================================================
    # CONNECT SQL SERVER
    # =====================================================
    print("\nConnecting SQL Server...")
    sql_conn = pyodbc.connect(SQL_CONN_STR)
    print("Connected to SQL Server")
    # =====================================================
    # CONNECT CLICKHOUSE
    # =====================================================
    print("\nConnecting ClickHouse...")
    ch_client = clickhouse_connect.get_client(**CH_CONFIG)
    print("Connected to ClickHouse")
    # =====================================================
    # QUERY
    # =====================================================
    query = f"""
    SELECT *
    FROM dbo.[{TABLE_NAME}]
    WHERE Project_Id = {PROJECT_ID}
    """
    print("\n====================================")
    print("Executing Query")
    print("====================================")
    print(query)
    # =====================================================
    # CHUNK SIZE
    # =====================================================
    chunk_size = 100000
    total_rows = 0
    # Chunks that raised during clean/truncate/insert. They are skipped, so
    # the closing banner must not claim success when this is non-zero.
    failed_chunks = 0
    # =====================================================
    # READ DATA
    # =====================================================
    for chunk in pd.read_sql(
        query,
        sql_conn,
        chunksize=chunk_size
    ):
        try:
            print("\n====================================")
            print(f"Processing Rows : {len(chunk)}")
            print("====================================")
            # =================================================
            # CLEAN DATA
            # =================================================
            chunk = clean_dataframe(chunk)
            # =================================================
            # DEBUG COLUMN TYPES
            # =================================================
            print("\nCOLUMN TYPES")
            for col in chunk.columns:
                try:
                    sample = chunk[col].dropna()
                    if len(sample) > 0:
                        print(
                            col,
                            type(sample.iloc[0]),
                            sample.iloc[0]
                        )
                except Exception:
                    # Debug output only; never let it abort the chunk.
                    pass
            # =================================================
            # TRUNCATE TABLE
            # =================================================
            # Done lazily inside the loop, so an empty source result
            # leaves the existing target data in place.
            if TRUNCATE_BEFORE_LOAD and not table_truncated:
                try:
                    print("\n====================================")
                    print(f"TRUNCATING : {TABLE_NAME}")
                    print("====================================")
                    truncate_query = f"""
                    TRUNCATE TABLE
                    `{CH_CONFIG['database']}`.`{TABLE_NAME}`
                    """
                    print(truncate_query)
                    ch_client.command(
                        truncate_query
                    )
                    print(
                        "TABLE TRUNCATED SUCCESSFULLY"
                    )
                    table_truncated = True
                except Exception as truncate_error:
                    print("\nTRUNCATE FAILED")
                    print(str(truncate_error))
                    # Re-raised so the chunk handler below records it.
                    raise
            # =================================================
            # INSERT DATA
            # =================================================
            try:
                print("\n====================================")
                print("INSERTING DATA INTO CLICKHOUSE")
                print("====================================")
                # Table name is back-quoted because it contains a space.
                ch_client.insert_df(
                    table=f"`{TABLE_NAME}`",
                    df=chunk,
                    database=CH_CONFIG['database']
                )
                total_rows += len(chunk)
                print(
                    f"\nTOTAL INSERTED : "
                    f"{total_rows}"
                )
            except Exception as insert_error:
                failed_chunks += 1
                print("\nINSERT FAILED")
                print(str(insert_error))
                traceback.print_exc()
                # =============================================
                # SAVE ERROR LOG
                # =============================================
                with open(
                    "clickhouse_insert_error.log",
                    "a",
                    encoding="utf-8"
                ) as log:
                    log.write(
                        "\n\n================================"
                        f"\nTIME : {datetime.now()}"
                        f"\nTABLE : {TABLE_NAME}"
                        f"\nERROR : {str(insert_error)}"
                        f"\nTRACEBACK :\n"
                        f"{traceback.format_exc()}"
                        "\n================================"
                    )
                continue
        except Exception as chunk_error:
            failed_chunks += 1
            print("\n====================================")
            print("CHUNK PROCESS FAILED")
            print("====================================")
            print(str(chunk_error))
            traceback.print_exc()
            continue
    print("\n====================================")
    if failed_chunks:
        # Skipped chunks mean the target is missing rows.
        print("ETL COMPLETED WITH ERRORS")
        print(f"FAILED CHUNKS : {failed_chunks}")
    else:
        print("ETL COMPLETED SUCCESSFULLY")
    print(f"TOTAL ROWS INSERTED : {total_rows}")
    print("====================================")
 # =========================================================
 # MAIN ERROR
 # =========================================================
 except Exception as main_error:
    print("\n====================================")
    print("MAIN ERROR")
    print("====================================")
    print(str(main_error))
    traceback.print_exc()
    with open(
        "clickhouse_main_error.log",
        "a",
        encoding="utf-8"
    ) as log:
        log.write(
            "\n\n================================"
            f"\nTIME : {datetime.now()}"
            f"\nERROR : {str(main_error)}"
            f"\nTRACEBACK :\n"
            f"{traceback.format_exc()}"
            "\n================================"
        )
 # =========================================================
 # CLOSE CONNECTIONS
 # =========================================================
 finally:
    # Closing stays best effort, but failures are reported instead of
    # being swallowed by a bare ``except``.
    if sql_conn is not None:
        try:
            sql_conn.close()
            print("\nSQL Server Connection Closed")
        except Exception as close_error:
            print(f"\nSQL Server Connection Close Failed : {close_error}")
    if ch_client is not None:
        try:
            ch_client.close()
            print("ClickHouse Connection Closed")
        except Exception as close_error:
            print(f"ClickHouse Connection Close Failed : {close_error}")
 print("\n====================================")
 print("ETL Finished :", datetime.now())
 print("====================================")
@@ -0,0 +1,632 @@
 import os
 import traceback
 import warnings
 from datetime import datetime

 import clickhouse_connect
 import numpy as np
 import pandas as pd
 import pyodbc
 # =========================================================
 # IGNORE WARNINGS
 # =========================================================
 # pd.read_sql is handed a raw pyodbc connection further down; suppress the
 # warning pandas emits for connections that are not SQLAlchemy connectables.
 warnings.filterwarnings(
    action='ignore',
    message='pandas only supports SQLAlchemy connectable'
 )
 # Start-of-run banner with the wall-clock start time.
 print(
    "\n====================================",
    f"ETL Started : {datetime.now()}",
    "====================================",
    sep="\n"
 )
 # =========================================================
 # SQL SERVER CONNECTION
 # =========================================================
 # Connection settings for the SQL Server source and the ClickHouse target.
 # Every value can be supplied through an environment variable (ETL_SQL_* /
 # ETL_CH_*); the literals are fallbacks that keep existing runs working.
 # NOTE(review): the fallback literals are real credentials committed to the
 # repository - rotate them and delete the fallbacks once the environment
 # variables are provisioned.
 # NOTE(review): TrustServerCertificate=yes asks the ODBC driver to accept the
 # server certificate without validating it - confirm this is intended.
 SQL_CONN_STR = ''.join(
    f'{key}={value};'
    for key, value in (
        ('DRIVER', '{ODBC Driver 17 for SQL Server}'),
        ('SERVER', os.environ.get('ETL_SQL_SERVER', '10.200.25.65')),
        ('DATABASE', os.environ.get('ETL_SQL_DATABASE', 'CPMIndiaBusinessInsight')),
        ('UID', os.environ.get('ETL_SQL_UID', 'bsgteam_test')),
        ('PWD', os.environ.get('ETL_SQL_PWD', 'B$gt3@m#00512')),
        ('TrustServerCertificate', 'yes'),
    )
 )
 # Keyword arguments passed unchanged to clickhouse_connect.get_client().
 CH_CONFIG = {
    'host': os.environ.get('ETL_CH_HOST', '172.188.12.194'),
    # Environment values arrive as strings; the client expects an int port.
    'port': int(os.environ.get('ETL_CH_PORT', 8123)),
    'username': os.environ.get('ETL_CH_USERNAME', 'default'),
    'password': os.environ.get('ETL_CH_PASSWORD', 'dipanshu_k'),
    'database': os.environ.get('ETL_CH_DATABASE', 'DaburIndia_BI')
 }
 # =========================================================
 # TABLE DETAILS
 # =========================================================
 # Table name used both for the SQL Server source (dbo schema) and for the
 # ClickHouse target, and the project the extract query filters on.
 TABLE_NAME = "Web Logins"
 PROJECT_ID = 41654
 # When enabled the target table is emptied once per run, before the first
 # insert; table_truncated records that this has already happened.
 TRUNCATE_BEFORE_LOAD = True
 table_truncated = False
 # Columns (matched case-insensitively) cleaned as calendar dates.
 DATE_COLUMNS = ["visit_date"]
 # Columns (matched case-insensitively) cleaned as full timestamps.
 DATETIME_COLUMNS = ["create_date", "update_date"]
 # =========================================================
 # CLEAN DATAFRAME
 # =========================================================
 def _normalize_value(val):
    """Convert one cell of an untyped column to a plain Python value.

    Nulls become None, strings are stripped, booleans and integers become
    int, floats become int when they hold a whole number (float otherwise),
    pandas Timestamps become datetime objects, other datetime/date-like
    objects pass through unchanged, and anything else is stringified.
    """
    if pd.isna(val):
        return None
    if isinstance(val, str):
        return val.strip()
    # bool is tested before int because bool is a subclass of int.
    if isinstance(val, bool):
        return int(val)
    if isinstance(val, (int, np.integer)):
        return int(val)
    if isinstance(val, (float, np.floating)):
        if np.isnan(val):
            return None
        return int(val) if val.is_integer() else float(val)
    # Timestamp is tested before datetime because it subclasses datetime.
    if isinstance(val, pd.Timestamp):
        return val.to_pydatetime()
    if isinstance(val, datetime):
        return val
    # Remaining objects exposing ``year`` (e.g. date) are kept as they are.
    if hasattr(val, 'year'):
        return val
    return str(val)

 def clean_dataframe(df):
    """Normalise a DataFrame chunk into plain Python values before loading.

    NaN is replaced with None, then each column is handled by the first rule
    that matches:

    * name listed in DATE_COLUMNS (case-insensitive): parsed with
      ``pd.to_datetime(errors='coerce')`` and reduced to ``date`` objects;
    * name listed in DATETIME_COLUMNS (case-insensitive): parsed the same
      way and converted with ``to_pydatetime()``;
    * integer dtype: values cast to ``int``;
    * float dtype: values cast to ``int`` when every non-null value is
      whole, otherwise to ``float``;
    * anything else: cleaned value by value.

    A failure in one column is printed and that column is left as it was at
    the time of the failure; a failure in one cell is printed and the cell
    becomes None.

    Args:
        df: The chunk to clean.

    Returns:
        The cleaned DataFrame. If cleaning fails as a whole the error is
        printed and the frame is returned in whatever state it had reached.
    """
    def _cast_or_none(caster):
        # Element-wise converter that maps nulls to None.
        return lambda x: caster(x) if pd.notnull(x) else None

    try:
        df = df.replace({np.nan: None})
        for col in df.columns:
            try:
                print(f"\nCleaning Column : {col}")
                key = col.lower()
                if key in [name.lower() for name in DATE_COLUMNS]:
                    print(f"Date Column : {col}")
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                    df[col] = [
                        None if pd.isna(stamp) else stamp.date()
                        for stamp in df[col]
                    ]
                elif key in [name.lower() for name in DATETIME_COLUMNS]:
                    print(f"DateTime Column : {col}")
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                    df[col] = [
                        None if pd.isna(stamp) else stamp.to_pydatetime()
                        for stamp in df[col]
                    ]
                elif pd.api.types.is_integer_dtype(df[col]):
                    print(f"Integer Column : {col}")
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                    df[col] = df[col].apply(_cast_or_none(int))
                elif pd.api.types.is_float_dtype(df[col]):
                    print(f"Float Column : {col}")
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                    present = df[col].dropna()
                    # Whole-valued floats (12.0) are stored as ints (12).
                    all_whole = len(present) > 0 and (
                        (present % 1 == 0).all()
                    )
                    caster = int if all_whole else float
                    df[col] = df[col].apply(_cast_or_none(caster))
                else:
                    print(f"String/Object Column : {col}")
                    values = []
                    for cell in df[col]:
                        try:
                            values.append(_normalize_value(cell))
                        except Exception as row_error:
                            print(f"Row Cleaning Error in Column {col}")
                            print(str(row_error))
                            values.append(None)
                    df[col] = values
            except Exception as col_error:
                print("\n================================")
                print(f"COLUMN FAILED : {col}")
                print(str(col_error))
                print("================================")
        return df
    except Exception as clean_error:
        print("\n================================")
        print("DATA CLEAN FAILED")
        print(str(clean_error))
        print(traceback.format_exc())
        print("================================")
        return df
 # =========================================================
 # MAIN PROCESS
 # =========================================================
 def _append_error_log(path, error, table=None):
    """Append one entry describing the exception being handled to *path*.

    Call this from inside an ``except`` block: the traceback text is taken
    from the exception that is currently being handled.

    Args:
        path: Log file to append to.
        error: The exception to report.
        table: Optional table name; a ``TABLE :`` line is written when given.
    """
    parts = [
        "\n\n================================",
        f"\nTIME : {datetime.now()}",
    ]
    if table is not None:
        parts.append(f"\nTABLE : {table}")
    parts.append(f"\nERROR : {str(error)}")
    parts.append(f"\nTRACEBACK :\n{traceback.format_exc()}")
    parts.append("\n================================")
    try:
        with open(path, "a", encoding="utf-8") as log:
            log.write("".join(parts))
    except OSError as log_error:
        # Logging is best-effort; an unwritable log must not mask the
        # error that is being reported.
        print(f"\nCOULD NOT WRITE {path} : {log_error}")

 # Bound before the try so the cleanup in ``finally`` can tell a connection
 # that was never opened from one that needs closing.
 sql_conn = None
 ch_client = None
 try:
    # Open the source (SQL Server) and target (ClickHouse) connections.
    print("\nConnecting SQL Server...")
    sql_conn = pyodbc.connect(SQL_CONN_STR)
    print("Connected to SQL Server")
    print("\nConnecting ClickHouse...")
    ch_client = clickhouse_connect.get_client(**CH_CONFIG)
    print("Connected to ClickHouse")
    # Extract query: every column of the source table for one project.
    query = f"""
    SELECT *
    FROM dbo.[{TABLE_NAME}]
    WHERE Project_Id = {PROJECT_ID}
    """
    print("\n====================================")
    print("Executing Query")
    print("====================================")
    print(query)
    chunk_size = 100000
    total_rows = 0
    # Number of chunks that were skipped or failed to insert.
    failed_chunks = 0
    # Stream the result set chunk by chunk instead of loading it at once.
    for chunk in pd.read_sql(
        query,
        sql_conn,
        chunksize=chunk_size
    ):
        # -------------------------------------------------
        # Stage 1: clean the chunk (a failure skips the chunk)
        # -------------------------------------------------
        try:
            print("\n====================================")
            print(f"Processing Rows : {len(chunk)}")
            print("====================================")
            chunk = clean_dataframe(chunk)
            # Print one sample value and its type per column for debugging.
            print("\nCOLUMN TYPES")
            for col in chunk.columns:
                try:
                    sample = chunk[col].dropna()
                    if len(sample) > 0:
                        print(
                            col,
                            type(sample.iloc[0]),
                            sample.iloc[0]
                        )
                except Exception:
                    # Purely diagnostic output; never fail a chunk over it.
                    pass
        except Exception as chunk_error:
            failed_chunks += 1
            print("\n====================================")
            print("CHUNK PROCESS FAILED")
            print("====================================")
            print(str(chunk_error))
            traceback.print_exc()
            continue
        # -------------------------------------------------
        # Stage 2: truncate the target once, before the first insert.
        # This sits outside the per-chunk error handling on purpose: a
        # failed truncate must abort the run (it is reported by the MAIN
        # ERROR handler) instead of being retried for every chunk.
        # -------------------------------------------------
        if TRUNCATE_BEFORE_LOAD and not table_truncated:
            try:
                print("\n====================================")
                print(f"TRUNCATING : {TABLE_NAME}")
                print("====================================")
                truncate_query = f"""
                    TRUNCATE TABLE
                    `{CH_CONFIG['database']}`.`{TABLE_NAME}`
                    """
                print(truncate_query)
                ch_client.command(
                    truncate_query
                )
                print(
                    "TABLE TRUNCATED SUCCESSFULLY"
                )
                table_truncated = True
            except Exception as truncate_error:
                print("\nTRUNCATE FAILED")
                print(str(truncate_error))
                raise
        # -------------------------------------------------
        # Stage 3: insert the chunk (a failure is logged and counted,
        # then the run continues with the next chunk)
        # -------------------------------------------------
        try:
            print("\n====================================")
            print("INSERTING DATA INTO CLICKHOUSE")
            print("====================================")
            ch_client.insert_df(
                table=f"`{TABLE_NAME}`",
                df=chunk,
                database=CH_CONFIG['database']
            )
            total_rows += len(chunk)
            print(
                f"\nTOTAL INSERTED : "
                f"{total_rows}"
            )
        except Exception as insert_error:
            failed_chunks += 1
            print("\nINSERT FAILED")
            print(str(insert_error))
            traceback.print_exc()
            _append_error_log(
                "clickhouse_insert_error.log",
                insert_error,
                table=TABLE_NAME
            )
    # Report success only when every chunk actually reached ClickHouse.
    print("\n====================================")
    if failed_chunks:
        print("ETL COMPLETED WITH ERRORS")
        print(f"FAILED CHUNKS : {failed_chunks}")
    else:
        print("ETL COMPLETED SUCCESSFULLY")
    print(f"TOTAL ROWS INSERTED : {total_rows}")
    print("====================================")
 # =========================================================
 # MAIN ERROR: connection, query, read or truncate failure
 # =========================================================
 except Exception as main_error:
    print("\n====================================")
    print("MAIN ERROR")
    print("====================================")
    print(str(main_error))
    traceback.print_exc()
    _append_error_log(
        "clickhouse_main_error.log",
        main_error
    )
 # =========================================================
 # CLOSE CONNECTIONS (only those that were opened)
 # =========================================================
 finally:
    if sql_conn is not None:
        try:
            sql_conn.close()
            print("\nSQL Server Connection Closed")
        except Exception as close_error:
            print(f"\nSQL Server Connection Close Failed : {close_error}")
    if ch_client is not None:
        try:
            ch_client.close()
            print("ClickHouse Connection Closed")
        except Exception as close_error:
            print(f"ClickHouse Connection Close Failed : {close_error}")
 print("\n====================================")
 print("ETL Finished :", datetime.now())
 print("====================================")
@@ -90,3 +90,80 @@ Traceback (most recent call last):
 NameError: name 'PROJECT_ID' is not defined. Did you mean: 'PROJECTID'?
 ================================
 ================================
 TIME : 2026-05-19 15:19:40.194819
 ERROR : ('08S01', '[08S01] [Microsoft][ODBC Driver 17 for SQL Server]TCP Provider: An existing connection was forcibly closed by the remote host.\r\n (10054) (SQLGetData); [08S01] [Microsoft][ODBC Driver 17 for SQL Server]Communication link failure (10054)')
 TRACEBACK :
 Traceback (most recent call last):
  File "d:\Python Code\PaidVisibility_Import.py", line 345, in <module>
    for chunk in pd.read_sql(
                 ~~~~~~~~~~~^
        query,
        ^^^^^^
        sql_conn,
        ^^^^^^^^^
        chunksize=chunk_size
        ^^^^^^^^^^^^^^^^^^^^
    ):
    ^
  File "C:\Users\dipanshuk\AppData\Local\Python\pythoncore-3.14-64\Lib\site-packages\pandas\io\sql.py", line 2730, in _query_iterator
    data = cursor.fetchmany(chunksize)
 pyodbc.OperationalError: ('08S01', '[08S01] [Microsoft][ODBC Driver 17 for SQL Server]TCP Provider: An existing connection was forcibly closed by the remote host.\r\n (10054) (SQLGetData); [08S01] [Microsoft][ODBC Driver 17 for SQL Server]Communication link failure (10054)')
 ================================
 ================================
 TIME : 2026-05-19 15:26:07.910371
 ERROR : ('08001', '[08001] [Microsoft][ODBC Driver 17 for SQL Server]Named Pipes Provider: Could not open a connection to SQL Server [5].  (5) (SQLDriverConnect); [08001] [Microsoft][ODBC Driver 17 for SQL Server]Login timeout expired (0); [08001] [Microsoft][ODBC Driver 17 for SQL Server]A network-related or instance-specific error has occurred while establishing a connection to SQL Server. Server is not found or not accessible. Check if instance name is correct and if SQL Server is configured to allow remote connections. For more information see SQL Server Books Online. (5)')
 TRACEBACK :
 Traceback (most recent call last):
  File "d:\Python Code\OQaD Import.py", line 310, in <module>
    sql_conn = pyodbc.connect(SQL_CONN_STR)
 pyodbc.OperationalError: ('08001', '[08001] [Microsoft][ODBC Driver 17 for SQL Server]Named Pipes Provider: Could not open a connection to SQL Server [5].  (5) (SQLDriverConnect); [08001] [Microsoft][ODBC Driver 17 for SQL Server]Login timeout expired (0); [08001] [Microsoft][ODBC Driver 17 for SQL Server]A network-related or instance-specific error has occurred while establishing a connection to SQL Server. Server is not found or not accessible. Check if instance name is correct and if SQL Server is configured to allow remote connections. For more information see SQL Server Books Online. (5)')
 ================================
 ================================
 TIME : 2026-05-19 16:01:03.630328
 ERROR : ('08S01', '[08S01] [Microsoft][ODBC Driver 17 for SQL Server]TCP Provider: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.\r\n (10060) (SQLGetData); [08S01] [Microsoft][ODBC Driver 17 for SQL Server]Communication link failure (10060)')
 TRACEBACK :
 Traceback (most recent call last):
  File "d:\Python Code\PaidVisibility_Import.py", line 343, in <module>
    for chunk in pd.read_sql(
                 ~~~~~~~~~~~^
        query,
        ^^^^^^
        sql_conn,
        ^^^^^^^^^
        chunksize=chunk_size
        ^^^^^^^^^^^^^^^^^^^^
    ):
    ^
  File "C:\Users\dipanshuk\AppData\Local\Python\pythoncore-3.14-64\Lib\site-packages\pandas\io\sql.py", line 2730, in _query_iterator
    data = cursor.fetchmany(chunksize)
 pyodbc.OperationalError: ('08S01', '[08S01] [Microsoft][ODBC Driver 17 for SQL Server]TCP Provider: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.\r\n (10060) (SQLGetData); [08S01] [Microsoft][ODBC Driver 17 for SQL Server]Communication link failure (10060)')
 ================================
 ================================
 TIME : 2026-05-19 16:36:09.557213
 ERROR : ('08S01', '[08S01] [Microsoft][ODBC Driver 17 for SQL Server]TCP Provider: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.\r\n (10060) (SQLGetData); [08S01] [Microsoft][ODBC Driver 17 for SQL Server]Communication link failure (10060)')
 TRACEBACK :
 Traceback (most recent call last):
  File "d:\Python Code\PaidVisibility_Compliance Import.py", line 371, in <module>
    for chunk in pd.read_sql(
                 ~~~~~~~~~~~^
        query,
        ^^^^^^
        sql_conn,
        ^^^^^^^^^
        chunksize=chunk_size
        ^^^^^^^^^^^^^^^^^^^^
    ):
    ^
  File "C:\Users\dipanshuk\AppData\Local\Python\pythoncore-3.14-64\Lib\site-packages\pandas\io\sql.py", line 2730, in _query_iterator
    data = cursor.fetchmany(chunksize)
 pyodbc.OperationalError: ('08S01', '[08S01] [Microsoft][ODBC Driver 17 for SQL Server]TCP Provider: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.\r\n (10060) (SQLGetData); [08S01] [Microsoft][ODBC Driver 17 for SQL Server]Communication link failure (10060)')
 ================================