Master_Salesterritorylayer Import

2026-05-19 13:13:23 +05:30
parent e1cd0d000d
commit 04f4384e6a
2 changed files with 509 additions and 0 deletions
@@ -0,0 +1,462 @@
 import pyodbc
 import pandas as pd
 import clickhouse_connect
 import numpy as np
 from datetime import datetime
 import traceback
 import warnings
 # =========================================================
 # IGNORE WARNINGS
 # =========================================================
 warnings.filterwarnings(
    'ignore',
    'pandas only supports SQLAlchemy connectable'
 )
 print("ETL Started :", datetime.now())
 # =========================================================
 # SQL SERVER CONNECTION
 # =========================================================
 SQL_CONN_STR = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    'SERVER=10.200.25.65;'
    'DATABASE=CPMIndiaBusinessInsight;'
    'UID=bsgteam_test;'
    'PWD=B$gt3@m#00512;'
    'TrustServerCertificate=yes;'
 )
 # =========================================================
 # CLICKHOUSE CONFIG
 # =========================================================
 CH_CONFIG = {
    'host': '172.188.12.194',
    'port': 8123,
    'username': 'default',
    'password': 'dipanshu_k',
    'database': 'DaburIndia_BI'
 }
 # =========================================================
 # TABLE NAME
 # =========================================================
 TABLE_NAME = 'Master_Salesterritorylayer'
 PROJECTID = 41654
 # =========================================================
 # CLEAN DATAFRAME
 # =========================================================
 def clean_dataframe(df):
    try:
        # ---------------------------------------------
        # Replace NaN with None
        # ---------------------------------------------
        df = df.replace({np.nan: None})
        # ---------------------------------------------
        # Process Column Wise
        # ---------------------------------------------
        for col in df.columns:
            try:
                print(f"\nCleaning Column : {col}")
                # =====================================
                # DATETIME COLUMNS
                # =====================================
                if pd.api.types.is_datetime64_any_dtype(df[col]):
                    print(f"Datetime Column Detected : {col}")
                    df[col] = pd.to_datetime(
                        df[col],
                        errors='coerce'
                    )
                    # Remove invalid dates
                    df[col] = df[col].where(
                        (df[col].dt.year >= 1970) &
                        (df[col].dt.year <= 2100)
                    )
                    # IMPORTANT FIX
                    # Convert to Python datetime
                    df[col] = df[col].apply(
                        lambda x:
                        x.to_pydatetime()
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # INTEGER COLUMNS
                # =====================================
                elif pd.api.types.is_integer_dtype(df[col]):
                    print(f"Integer Column Detected : {col}")
                    df[col] = pd.to_numeric(
                        df[col],
                        errors='coerce'
                    )
                    df[col] = df[col].apply(
                        lambda x:
                        int(x)
                        if pd.notnull(x)
                        else None
                    )
                # =====================================
                # FLOAT COLUMNS
                # =====================================
                elif pd.api.types.is_float_dtype(df[col]):
                    print(f"Float Column Detected : {col}")
                    non_null = df[col].dropna()
                    # ---------------------------------
                    # Convert whole float to int
                    # Example:
                    # 3240.0 -> 3240
                    # ---------------------------------
                    if len(non_null) > 0 and (
                        (non_null % 1 == 0).all()
                    ):
                        df[col] = df[col].apply(
                            lambda x:
                            int(x)
                            if pd.notnull(x)
                            else None
                        )
                    else:
                        df[col] = df[col].apply(
                            lambda x:
                            float(x)
                            if pd.notnull(x)
                            else None
                        )
                # =====================================
                # OBJECT / STRING COLUMNS
                # =====================================
                else:
                    print(f"String/Object Column Detected : {col}")
                    cleaned = []
                    for val in df[col]:
                        # NULL
                        if pd.isnull(val):
                            cleaned.append(None)
                        # INTEGER
                        elif isinstance(
                            val,
                            (
                                int,
                                np.integer
                            )
                        ):
                            cleaned.append(int(val))
                        # FLOAT
                        elif isinstance(
                            val,
                            (
                                float,
                                np.floating
                            )
                        ):
                            if np.isnan(val):
                                cleaned.append(None)
                            else:
                                # Avoid 3240.0 issue
                                if val.is_integer():
                                    cleaned.append(int(val))
                                else:
                                    cleaned.append(float(val))
                        # STRING
                        elif isinstance(val, str):
                            cleaned.append(val.strip())
                        # BOOLEAN
                        elif isinstance(val, bool):
                            cleaned.append(int(val))
                        # DATETIME
                        elif isinstance(
                            val,
                            (
                                datetime,
                                pd.Timestamp
                            )
                        ):
                            cleaned.append(
                                val.to_pydatetime()
                                if isinstance(val, pd.Timestamp)
                                else val
                            )
                        # OTHER
                        else:
                            cleaned.append(str(val))
                    df[col] = cleaned
            except Exception as col_error:
                print("\n================================")
                print(f"COLUMN FAILED : {col}")
                print(str(col_error))
                print("================================")
        return df
    except Exception as clean_error:
        print("\n================================")
        print("DATA CLEAN FAILED")
        print(str(clean_error))
        print("================================")
        return df
 # =========================================================
 # MAIN PROCESS
 # =========================================================
 try:
    # =====================================================
    # CONNECT SQL SERVER
    # =====================================================
    sql_conn = pyodbc.connect(SQL_CONN_STR)
    print("Connected to SQL Server")
    # =====================================================
    # CONNECT CLICKHOUSE
    # =====================================================
    ch_client = clickhouse_connect.get_client(**CH_CONFIG)
    print("Connected to ClickHouse")
    # =====================================================
    # QUERY
    # =====================================================
    query = f"""
    SELECT *
    FROM dbo.[{TABLE_NAME}]
    WHERE PROJECTID = {PROJECTID}
    """
    print("\nExecuting Query:")
    print(query)
    # =====================================================
    # CHUNK SIZE
    # =====================================================
    chunk_size = 100000
    total_rows = 0
    # =====================================================
    # READ DATA
    # =====================================================
    for chunk in pd.read_sql(
        query,
        sql_conn,
        chunksize=chunk_size
    ):
        try:
            print("\n================================")
            print(f"Processing {len(chunk)} Rows")
            print("================================")
            # =================================================
            # CLEAN DATA
            # =================================================
            chunk = clean_dataframe(chunk)
            # =================================================
            # DEBUG COLUMN TYPES
            # =================================================
            print("\nCOLUMN TYPES")
            for col in chunk.columns:
                sample = chunk[col].dropna()
                if len(sample) > 0:
                    print(
                        col,
                        type(sample.iloc[0]),
                        sample.iloc[0]
                    )
            # =================================================
            # SAMPLE DATA
            # =================================================
            print("\nSAMPLE DATA")
            print(chunk.head(2))
            # =================================================
            # INSERT INTO CLICKHOUSE
            # =================================================
            print("\nInserting into ClickHouse...")
            ch_client.insert_df(
                table=TABLE_NAME,
                df=chunk,
                database=CH_CONFIG['database']
            )
            total_rows += len(chunk)
            print(
                f"\nInserted Total Rows : {total_rows}"
            )
        except Exception as chunk_error:
            print("\n================================")
            print("CHUNK INSERT FAILED")
            print("================================")
            print(str(chunk_error))
            traceback.print_exc()
            # =============================================
            # SAVE ERROR LOG
            # =============================================
            with open(
                "clickhouse_chunk_error.log",
                "a",
                encoding="utf-8"
            ) as log:
                log.write(
                    "\n\n================================"
                )
                log.write(
                    f"\nTIME : {datetime.now()}"
                )
                log.write(
                    f"\nTABLE : {TABLE_NAME}"
                )
                log.write(
                    f"\nERROR : {str(chunk_error)}"
                )
                log.write(
                    f"\nTRACEBACK :\n"
                    f"{traceback.format_exc()}"
                )
                log.write(
                    "\n================================"
                )
            continue
    print("\n================================")
    print("ETL COMPLETED SUCCESSFULLY")
    print(f"TOTAL ROWS INSERTED : {total_rows}")
    print("================================")
 # =========================================================
 # MAIN ERROR
 # =========================================================
 except Exception as main_error:
    print("\n================================")
    print("MAIN ERROR")
    print("================================")
    print(str(main_error))
    traceback.print_exc()
    with open(
        "clickhouse_main_error.log",
        "a",
        encoding="utf-8"
    ) as log:
        log.write(
            "\n\n================================"
        )
        log.write(
            f"\nTIME : {datetime.now()}"
        )
        log.write(
            f"\nERROR : {str(main_error)}"
        )
        log.write(
            f"\nTRACEBACK :\n"
            f"{traceback.format_exc()}"
        )
        log.write(
            "\n================================"
        )
 # =========================================================
 # CLOSE CONNECTIONS
 # =========================================================
 finally:
    try:
        sql_conn.close()
        print("\nSQL Server Connection Closed")
    except:
        pass
    try:
        ch_client.close()
        print("ClickHouse Connection Closed")
    except:
        pass
 print("\nETL Finished :", datetime.now())
@@ -43,3 +43,50 @@ Traceback (most recent call last):
 pyodbc.OperationalError: ('08S01', '[08S01] [Microsoft][ODBC Driver 17 for SQL Server]TCP Provider: An existing connection was forcibly closed by the remote host.\r\n (10054) (SQLGetData); [08S01] [Microsoft][ODBC Driver 17 for SQL Server]Communication link failure (10054)')
 ================================
 ================================
 TIME : 2026-05-19 13:10:48.622125
 ERROR : ('42S22', "[42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Project_Id'. (207) (SQLExecDirectW)")
 TRACEBACK :
 Traceback (most recent call last):
  File "d:\Python Code\Master_Salesterritorylayer_Import.py", line 291, in <module>
    for chunk in pd.read_sql(
                 ~~~~~~~~~~~^
        query,
        ^^^^^^
        sql_conn,
        ^^^^^^^^^
        chunksize=chunk_size
        ^^^^^^^^^^^^^^^^^^^^
    ):
    ^
  File "C:\Users\dipanshuk\AppData\Local\Python\pythoncore-3.14-64\Lib\site-packages\pandas\io\sql.py", line 702, in read_sql
    return pandas_sql.read_query(
           ~~~~~~~~~~~~~~~~~~~~~^
        sql,
        ^^^^
    ...<6 lines>...
        dtype=dtype,
        ^^^^^^^^^^^^
    )
    ^
  File "C:\Users\dipanshuk\AppData\Local\Python\pythoncore-3.14-64\Lib\site-packages\pandas\io\sql.py", line 2766, in read_query
    cursor = self.execute(sql, params)
  File "C:\Users\dipanshuk\AppData\Local\Python\pythoncore-3.14-64\Lib\site-packages\pandas\io\sql.py", line 2702, in execute
    cur.execute(sql, *args)
    ~~~~~~~~~~~^^^^^^^^^^^^
 pyodbc.ProgrammingError: ('42S22', "[42S22] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Invalid column name 'Project_Id'. (207) (SQLExecDirectW)")
 ================================
 ================================
 TIME : 2026-05-19 13:11:33.086601
 ERROR : name 'PROJECT_ID' is not defined
 TRACEBACK :
 Traceback (most recent call last):
  File "d:\Python Code\Master_Salesterritorylayer_Import.py", line 275, in <module>
    WHERE Project_Id = {PROJECT_ID}
                        ^^^^^^^^^^
 NameError: name 'PROJECT_ID' is not defined. Did you mean: 'PROJECTID'?
 ================================