Journey_Plan data Import

2026-05-20 12:40:47 +05:30
parent 3be8cd7259
commit 579d59e1b0
8 changed files with 15087 additions and 432 deletions
@@ -1,5 +1,3 @@
-
-
 import pyodbc
 import pandas as pd
 import clickhouse_connect
@@ -7,6 +5,7 @@ import numpy as np
 from datetime import datetime
 import traceback
 import warnings
+import time

 # =========================================================
 # IGNORE WARNINGS
@@ -16,10 +15,12 @@ warnings.filterwarnings(
    'pandas only supports SQLAlchemy connectable'
 )

-print("ETL Started :", datetime.now())
+print("\n================================================")
+print("ETL STARTED :", datetime.now())
+print("================================================")

 # =========================================================
-# SQL SERVER CONNECTION
+# SQL SERVER CONNECTION STRING
 # =========================================================
 SQL_CONN_STR = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
@@ -28,6 +29,7 @@ SQL_CONN_STR = (
    'UID=bsgteam_test;'
    'PWD=B$gt3@m#00512;'
    'TrustServerCertificate=yes;'
+    'Connection Timeout=60;'
 )

 # =========================================================
@@ -42,11 +44,67 @@ CH_CONFIG = {
 }

 # =========================================================
-# TABLE NAME
+# TABLE DETAILS
 # =========================================================
 TABLE_NAME = 'Journey_Plan'
 PROJECT_ID = 41654

+# =========================================================
+# SETTINGS
+# =========================================================
+TRUNCATE_BEFORE_LOAD = True
+table_truncated = False
+
+# =========================================================
+# CHUNK SIZE
+# =========================================================
+chunk_size = 20000
+
+# =========================================================
+# CONNECT SQL SERVER
+# =========================================================
+def connect_sql():
+
+    try:
+
+        conn = pyodbc.connect(
+            SQL_CONN_STR,
+            autocommit=True
+        )
+
+        print("Connected SQL Server")
+
+        return conn
+
+    except Exception as e:
+
+        print("SQL CONNECTION FAILED")
+        print(str(e))
+
+        raise
+
+# =========================================================
+# CONNECT CLICKHOUSE
+# =========================================================
+def connect_clickhouse():
+
+    try:
+
+        client = clickhouse_connect.get_client(
+            **CH_CONFIG
+        )
+
+        print("Connected ClickHouse")
+
+        return client
+
+    except Exception as e:
+
+        print("CLICKHOUSE CONNECTION FAILED")
+        print(str(e))
+
+        raise
+
 # =========================================================
 # CLEAN DATAFRAME
 # =========================================================
@@ -66,39 +124,46 @@ def clean_dataframe(df):

            try:

-                col_lower = col.lower()
+                print(f"\nCleaning Column : {col}")

                # =====================================
-                # DATE COLUMNS
+                # AUTO DETECT DATE / TIME COLUMNS
                # =====================================
-                if 'date' in col_lower:
+                if (
+                    'date' in col.lower()
+                    or 'time' in col.lower()
+                ):

-                    print(f"Cleaning Date Column : {col}")
+                    print(f"Date Column : {col}")

                    df[col] = pd.to_datetime(
                        df[col],
                        errors='coerce'
                    )

-                    # Remove invalid dates
-                    df[col] = df[col].where(
-                        (df[col].dt.year >= 1970) &
-                        (df[col].dt.year <= 2100)
-                    )
+                    cleaned_dates = []

-                    # Convert to Python Date
-                    df[col] = df[col].apply(
-                        lambda x:
-                        x.date()
-                        if pd.notnull(x)
-                        else None
-                    )
+                    for val in df[col]:
+
+                        if pd.isnull(val):
+
+                            cleaned_dates.append(None)
+
+                        else:
+
+                            cleaned_dates.append(
+                                val.to_pydatetime()
+                            )
+
+                    df[col] = cleaned_dates

                # =====================================
                # INTEGER COLUMNS
                # =====================================
                elif pd.api.types.is_integer_dtype(df[col]):

+                    print(f"Integer Column : {col}")
+
                    df[col] = pd.to_numeric(
                        df[col],
                        errors='coerce'
@@ -116,38 +181,27 @@ def clean_dataframe(df):
                # =====================================
                elif pd.api.types.is_float_dtype(df[col]):

-                    non_null = df[col].dropna()
+                    print(f"Float Column : {col}")

-                    # ---------------------------------
-                    # Convert whole float to int
-                    # Example:
-                    # 3240.0 -> 3240
-                    # ---------------------------------
-                    if len(non_null) > 0 and (
-                        (non_null % 1 == 0).all()
-                    ):
+                    df[col] = pd.to_numeric(
+                        df[col],
+                        errors='coerce'
+                    )

-                        df[col] = df[col].apply(
-                            lambda x:
-                            int(x)
-                            if pd.notnull(x)
-                            else None
-                        )
-
-                    else:
-
-                        df[col] = df[col].apply(
-                            lambda x:
-                            float(x)
-                            if pd.notnull(x)
-                            else None
-                        )
+                    df[col] = df[col].apply(
+                        lambda x:
+                        float(x)
+                        if pd.notnull(x)
+                        else None
+                    )

                # =====================================
                # OBJECT / STRING COLUMNS
                # =====================================
                else:

+                    print(f"String/Object Column : {col}")
+
                    cleaned = []

                    for val in df[col]:
@@ -157,6 +211,28 @@ def clean_dataframe(df):

                            cleaned.append(None)

+                        # DATETIME
+                        elif isinstance(
+                            val,
+                            (
+                                datetime,
+                                pd.Timestamp
+                            )
+                        ):
+
+                            if isinstance(
+                                val,
+                                pd.Timestamp
+                            ):
+
+                                cleaned.append(
+                                    val.to_pydatetime()
+                                )
+
+                            else:
+
+                                cleaned.append(val)
+
                        # INTEGER
                        elif isinstance(
                            val,
@@ -166,7 +242,9 @@ def clean_dataframe(df):
                            )
                        ):

-                            cleaned.append(int(val))
+                            cleaned.append(
+                                int(val)
+                            )

                        # FLOAT
                        elif isinstance(
@@ -183,41 +261,26 @@ def clean_dataframe(df):

                            else:

-                                # IMPORTANT FIX
-                                # Avoid '3240.0' string issue
-                                if val.is_integer():
-
-                                    cleaned.append(int(val))
-
-                                else:
-
-                                    cleaned.append(float(val))
-
-                        # STRING
-                        elif isinstance(val, str):
-
-                            cleaned.append(val.strip())
+                                cleaned.append(
+                                    float(val)
+                                )

                        # BOOLEAN
-                        elif isinstance(val, bool):
-
-                            cleaned.append(int(val))
-
-                        # DATETIME
                        elif isinstance(
                            val,
-                            (
-                                datetime,
-                                pd.Timestamp
-                            )
+                            bool
                        ):

-                            cleaned.append(str(val))
+                            cleaned.append(
+                                int(val)
+                            )

-                        # OTHER
+                        # STRING
                        else:

-                            cleaned.append(str(val))
+                            cleaned.append(
+                                str(val).strip()
+                            )

                    df[col] = cleaned

@@ -247,16 +310,12 @@ try:
    # =====================================================
    # CONNECT SQL SERVER
    # =====================================================
-    sql_conn = pyodbc.connect(SQL_CONN_STR)
-
-    print("Connected to SQL Server")
+    sql_conn = connect_sql()

    # =====================================================
    # CONNECT CLICKHOUSE
    # =====================================================
-    ch_client = clickhouse_connect.get_client(**CH_CONFIG)
-
-    print("Connected to ClickHouse")
+    ch_client = connect_clickhouse()

    # =====================================================
    # QUERY
@@ -267,125 +326,214 @@ try:
    WHERE Project_Id = {PROJECT_ID}
    """

-    print("\nExecuting Query:")
+    print("\nExecuting Query")
    print(query)

    # =====================================================
-    # CHUNK SIZE
+    # RETRY SETTINGS
    # =====================================================
-    chunk_size = 100000
-
-    total_rows = 0
+    retry_count = 0
+    max_retry = 5

    # =====================================================
-    # READ DATA
+    # MAIN RETRY LOOP
    # =====================================================
-    for chunk in pd.read_sql(
-        query,
-        sql_conn,
-        chunksize=chunk_size
-    ):
+    while retry_count < max_retry:

        try:

-            print("\n================================")
-            print(f"Processing {len(chunk)} Rows")
-            print("================================")
+            # =============================================
+            # READ SQL DATA
+            # =============================================
+            for chunk in pd.read_sql(
+                query,
+                sql_conn,
+                chunksize=chunk_size
+            ):

-            # =================================================
-            # CLEAN DATA
-            # =================================================
-            chunk = clean_dataframe(chunk)
-
-            # =================================================
-            # DEBUG COLUMN TYPES
-            # =================================================
-            print("\nCOLUMN TYPES")
-
-            for col in chunk.columns:
-
-                sample = chunk[col].dropna()
-
-                if len(sample) > 0:
+                try:

+                    print("\n================================")
                    print(
-                        col,
-                        type(sample.iloc[0]),
-                        sample.iloc[0]
+                        f"Processing Rows : "
+                        f"{len(chunk)}"
+                    )
+                    print("================================")
+
+                    # =====================================
+                    # CLEAN DATA
+                    # =====================================
+                    chunk = clean_dataframe(chunk)
+
+                    # =====================================
+                    # DEBUG COLUMN TYPES
+                    # =====================================
+                    print("\nCOLUMN DATATYPES")
+                    print(chunk.dtypes)
+
+                    print("\nCOLUMN SAMPLE TYPES")
+
+                    for col in chunk.columns:
+
+                        sample = chunk[col].dropna()
+
+                        if len(sample) > 0:
+
+                            print(
+                                col,
+                                type(sample.iloc[0]),
+                                sample.iloc[0]
+                            )
+
+                    # =====================================
+                    # TRUNCATE TABLE FIRST TIME ONLY
+                    # =====================================
+                    if (
+                        TRUNCATE_BEFORE_LOAD
+                        and
+                        not table_truncated
+                    ):
+
+                        print("\n================================")
+                        print(
+                            f"TRUNCATING TABLE : "
+                            f"{TABLE_NAME}"
+                        )
+                        print("================================")
+
+                        # IMPORTANT FIX
+                        truncate_query = f"""
+                        TRUNCATE TABLE
+                        `{CH_CONFIG['database']}`.`{TABLE_NAME}`
+                        """
+
+                        ch_client.command(
+                            truncate_query
+                        )
+
+                        print(
+                            "TABLE TRUNCATED SUCCESSFULLY"
+                        )
+
+                        table_truncated = True
+
+                    # =====================================
+                    # INSERT INTO CLICKHOUSE
+                    # =====================================
+                    print(
+                        "\nINSERTING INTO CLICKHOUSE..."
                    )

-            # =================================================
-            # SAMPLE DATA
-            # =================================================
-            print("\nSAMPLE DATA")
-            print(chunk.head(2))
+                    ch_client.insert_df(
+                        table=TABLE_NAME,
+                        df=chunk,
+                        database=CH_CONFIG['database']
+                    )

-            # =================================================
-            # INSERT INTO CLICKHOUSE
-            # =================================================
-            print("\nInserting into ClickHouse...")
+                    print(
+                        f"INSERTED : "
+                        f"{len(chunk)} ROWS"
+                    )

-            ch_client.insert_df(
-                table=TABLE_NAME,
-                df=chunk,
-                database=CH_CONFIG['database']
-            )
+                except Exception as insert_error:

-            total_rows += len(chunk)
+                    print("\n================================")
+                    print("INSERT FAILED")
+                    print("================================")

-            print(
-                f"\nInserted Total Rows : {total_rows}"
-            )
+                    print(str(insert_error))

-        except Exception as chunk_error:
+                    traceback.print_exc()
+
+                    # =================================
+                    # SAVE ERROR LOG
+                    # =================================
+                    with open(
+                        "insert_error.log",
+                        "a",
+                        encoding="utf-8"
+                    ) as log:
+
+                        log.write(
+                            "\n\n================================"
+                        )
+
+                        log.write(
+                            f"\nTIME : "
+                            f"{datetime.now()}"
+                        )
+
+                        log.write(
+                            f"\nTABLE : "
+                            f"{TABLE_NAME}"
+                        )
+
+                        log.write(
+                            f"\nERROR : "
+                            f"{str(insert_error)}"
+                        )
+
+                        log.write(
+                            f"\nTRACEBACK :\n"
+                            f"{traceback.format_exc()}"
+                        )
+
+                        log.write(
+                            "\n================================"
+                        )
+
+                    continue
+
+            # =============================================
+            # SUCCESS
+            # =============================================
+            break
+
+        # =================================================
+        # SQL CONNECTION FAILURE
+        # =================================================
+        except pyodbc.OperationalError as op_error:
+
+            retry_count += 1

            print("\n================================")
-            print("CHUNK INSERT FAILED")
+            print(
+                f"SQL CONNECTION LOST "
+                f"- RETRY {retry_count}"
+            )
            print("================================")

-            print(str(chunk_error))
+            print(str(op_error))
+
+            time.sleep(10)
+
+            try:
+
+                sql_conn.close()
+
+            except:
+                pass
+
+            # RECONNECT SQL
+            sql_conn = connect_sql()
+
+        # =================================================
+        # OTHER ERROR
+        # =================================================
+        except Exception as loop_error:
+
+            print("\n================================")
+            print("MAIN LOOP ERROR")
+            print("================================")
+
+            print(str(loop_error))

            traceback.print_exc()

-            # =============================================
-            # SAVE ERROR LOG
-            # =============================================
-            with open(
-                "clickhouse_chunk_error.log",
-                "a",
-                encoding="utf-8"
-            ) as log:
-
-                log.write(
-                    "\n\n================================"
-                )
-
-                log.write(
-                    f"\nTIME : {datetime.now()}"
-                )
-
-                log.write(
-                    f"\nTABLE : {TABLE_NAME}"
-                )
-
-                log.write(
-                    f"\nERROR : {str(chunk_error)}"
-                )
-
-                log.write(
-                    f"\nTRACEBACK :\n"
-                    f"{traceback.format_exc()}"
-                )
-
-                log.write(
-                    "\n================================"
-                )
-
-            continue
+            break

    print("\n================================")
    print("ETL COMPLETED SUCCESSFULLY")
-    print(f"TOTAL ROWS INSERTED : {total_rows}")
    print("================================")

 # =========================================================
@@ -402,7 +550,7 @@ except Exception as main_error:
    traceback.print_exc()

    with open(
-        "clickhouse_main_error.log",
+        "main_error.log",
        "a",
        encoding="utf-8"
    ) as log:
@@ -437,7 +585,7 @@ finally:

        sql_conn.close()

-        print("\nSQL Server Connection Closed")
+        print("\nSQL SERVER CONNECTION CLOSED")

    except:
        pass
@@ -446,9 +594,11 @@ finally:

        ch_client.close()

-        print("ClickHouse Connection Closed")
+        print("CLICKHOUSE CONNECTION CLOSED")

    except:
        pass

-print("\nETL Finished :", datetime.now())
+print("\n================================================")
+print("ETL FINISHED :", datetime.now())
+print("================================================")