Thursday, July 13, 2017

How to find Logical duplicate records in Type-2?

Compile following procedure using procedure code provided below.

Procedure Code: LOGICAL_DUP_CHECKER

CREATE OR REPLACE PROCEDURE DW_DF.LOGICAL_DUP_CHECKER

    (

        schema_name_i IN VARCHAR2,

        table_name_i IN VARCHAR2,

        natural_keylist_i IN VARCHAR2,-- comma delimited column list

        metadata_collist_i IN VARCHAR2, -- comma delimited column list

        ocolumn_name_i IN VARCHAR2

    )

    AS

        TYPE Sql_Lines IS TABLE OF VARCHAR2(1000);

        v_ld_select_sql_lines Sql_Lines;

        v_curr_line number(3);

        v_partition_clause varchar2(1000);

        -- UPPER all input fields

        schema_name_in VARCHAR2(100):=UPPER(schema_name_i);

        table_name_in VARCHAR2(100):=UPPER(table_name_i);

        natural_keylist_in VARCHAR2(1000):=UPPER(natural_keylist_i);

        metadata_collist_in VARCHAR2(1000):=UPPER(metadata_collist_i);

        ocolumn_name_in VARCHAR2(100):=UPPER(ocolumn_name_i);

        CURSOR c_ColumnNames IS

            SELECT

                ATC.COLUMN_NAME,

                ATC.DATA_TYPE

            FROM

                ALL_TAB_COLUMNS ATC

            WHERE

                ATC.OWNER = schema_name_in AND

                ATC.TABLE_NAME = table_name_in AND

                ATC.COLUMN_NAME NOT IN (select regexp_substr(metadata_collist_in,'[^,]+', 1, level) from dual

                    connect by regexp_substr(metadata_collist_in, '[^,]+', 1, level) is not null) AND

                ATC.COLUMN_NAME NOT IN (select regexp_substr(natural_keylist_in,'[^,]+', 1, level) from dual

                    connect by regexp_substr(natural_keylist_in, '[^,]+', 1, level) is not null)

            ORDER BY

                ATC.COLUMN_ID;


        COLUMN_NAMES c_ColumnNames%ROWTYPE;

        BEGIN

            dbms_output.enable(1000000);

            v_curr_line:= 1;

            v_ld_select_sql_lines:= Sql_Lines('---- Logical Duplicate Query for table ' || schema_name_in ||'.'|| table_name_in ||'----- ');


            -- Provide output info on Input

            dbms_output.put_line ('Schema Name: ' || schema_name_in);

            dbms_output.put_line ('Table Name: ' || table_name_in);

            dbms_output.put_line ('Natural Column List Starts');

            FOR

                i IN

                (SELECT REGEXP_SUBSTR (natural_keylist_in,

                                       '[^,]+',

                                       1,

                                       LEVEL) column_name

                 FROM dual

                 CONNECT BY REGEXP_SUBSTR (natural_keylist_in,

                                           '[^,]+',

                                           1,

                                           LEVEL) IS NOT NULL) LOOP

                dbms_output.put_line ('Natural Column : ' || i.column_name);

            END LOOP;

            dbms_output.put_line ('Natural Column List Completed');

            dbms_output.put_line ('Order Column Name: ' || ocolumn_name_in);



            dbms_output.put_line ('========================Logical Dups SQL========================');


            -- Started creating sql

            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'SELECT * FROM ';


            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= '(';


            -- Natural Key part

            -- Also creating partition clause with same loop

            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'SELECT ';

            FOR

                i IN

                (SELECT REGEXP_SUBSTR (natural_keylist_in,

                                       '[^,]+',

                                       1,

                                       LEVEL) column_name

                 FROM dual

                 CONNECT BY REGEXP_SUBSTR (natural_keylist_in,

                                           '[^,]+',

                                           1,

                                           LEVEL) IS NOT NULL) LOOP

                v_curr_line:= v_curr_line+1;

                v_ld_select_sql_lines.extend;

                v_ld_select_sql_lines(v_curr_line):= i.column_name || ',';

                v_partition_clause:= v_partition_clause || i.column_name || ',';

            END LOOP;

            v_partition_clause:= substr(v_partition_clause,0,length(v_partition_clause)-1);

            v_partition_clause:= 'PARTITION BY ' || v_partition_clause || ' ORDER BY ' || ocolumn_name_in || ' ASC ';

            dbms_output.put_line('Partition cluase: ' || v_partition_clause);

            -- Natural Key part completed


            -- Add Order key for select part

            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'N_' || ocolumn_name_in || ' AS ' || ocolumn_name_in;


            -- Add compare part to select sql

            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= ',CASE WHEN ';


            OPEN c_ColumnNames;

            FETCH c_ColumnNames INTO COLUMN_NAMES;

            LOOP

            EXIT WHEN c_ColumnNames%NOTFOUND;


            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'C_' || COLUMN_NAMES.COLUMN_NAME || ' =  ' || 'N_' || COLUMN_NAMES.COLUMN_NAME || ' AND ';


            FETCH c_ColumnNames INTO COLUMN_NAMES;

            END LOOP;

            CLOSE c_ColumnNames;


            -- Remove AND from last select line

            v_ld_select_sql_lines(v_curr_line):= replace(v_ld_select_sql_lines(v_curr_line),' AND ',' ');


            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'THEN ''LOGICAL DUPS'' ';


            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'ELSE ''NO DUPS'' END DUP_OR_NOT ';


            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'FROM ( ';

            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'SELECT ';

            FOR

                i IN

                (SELECT REGEXP_SUBSTR (natural_keylist_in,

                                       '[^,]+',

                                       1,

                                       LEVEL) column_name

                 FROM dual

                 CONNECT BY REGEXP_SUBSTR (natural_keylist_in,

                                           '[^,]+',

                                           1,

                                           LEVEL) IS NOT NULL) LOOP

                v_curr_line:= v_curr_line+1;

                v_ld_select_sql_lines.extend;

                v_ld_select_sql_lines(v_curr_line):= i.column_name || ',';

            END LOOP;


            -- SQL Top part completed


            -- Order by column current and next value

            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= 'NVL (' || ocolumn_name_in || ', -1) C_' || ocolumn_name_in;


            -- Going forward sql will use , before columns details

            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):= ',NVL(LEAD(' || ocolumn_name_in || ', 1, -1) OVER (' || v_partition_clause || '), -1) N_' || ocolumn_name_in ;


            -- based on column type apply NVL for each column that needs to be compared for logical dups

            OPEN c_ColumnNames;

            FETCH c_ColumnNames INTO COLUMN_NAMES;

            LOOP

            EXIT WHEN c_ColumnNames%NOTFOUND;


            -- Order by column current and next value



            -- Going forward sql will use , before columns details


            --dbms_output.put_line('For column: ' || COLUMN_NAMES.COLUMN_NAME || ' Data_type is ' ||COLUMN_NAMES.DATA_TYPE);


            CASE

                WHEN COLUMN_NAMES.DATA_TYPE = 'NUMBER' THEN

                BEGIN

                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= ',NVL (' || COLUMN_NAMES.COLUMN_NAME || ', -1000) C_' || COLUMN_NAMES.COLUMN_NAME;


                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= ',NVL(LEAD(' || COLUMN_NAMES.COLUMN_NAME || ', 1, -1000) OVER (' || v_partition_clause || '), -1000) N_' || COLUMN_NAMES.COLUMN_NAME ;

               END;

               WHEN COLUMN_NAMES.DATA_TYPE = 'CHAR' OR COLUMN_NAMES.DATA_TYPE = 'VARCHAR2' THEN

               BEGIN

                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= ',NVL (' || COLUMN_NAMES.COLUMN_NAME || ', ''$####$'' ) C_' || COLUMN_NAMES.COLUMN_NAME;


                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= ',NVL(LEAD(' || COLUMN_NAMES.COLUMN_NAME || ', 1, ''$####$'') OVER (' || v_partition_clause || '), ''$####$'') N_' || COLUMN_NAMES.COLUMN_NAME ;

               END;

               WHEN COLUMN_NAMES.DATA_TYPE = 'DATE' THEN

               BEGIN

                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= ',NVL (' || COLUMN_NAMES.COLUMN_NAME || ', ''01-JAN-1899'' ) C_' || COLUMN_NAMES.COLUMN_NAME;


                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= ',NVL(LEAD(' || COLUMN_NAMES.COLUMN_NAME || ', 1, ''01-JAN-1899'') OVER (' || v_partition_clause || '), ''01-JAN-1899'') N_' || COLUMN_NAMES.COLUMN_NAME ;

               END;
             
               WHEN COLUMN_NAMES.DATA_TYPE = 'TIMESTAMP(6)' THEN

               BEGIN

                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= ',NVL (' || COLUMN_NAMES.COLUMN_NAME || ', ''01-JAN-1899'' ) C_' || COLUMN_NAMES.COLUMN_NAME;


                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= ',NVL(LEAD(' || COLUMN_NAMES.COLUMN_NAME || ', 1, ''01-JAN-1899'') OVER (' || v_partition_clause || '), ''01-JAN-1899'') N_' || COLUMN_NAMES.COLUMN_NAME ;

               END;

               ELSE

               BEGIN

                     v_curr_line:= v_curr_line+1;

                     v_ld_select_sql_lines.extend;

                     v_ld_select_sql_lines(v_curr_line):= 'Got Confuse with Column''s datatype';

              END;

            END CASE;


            FETCH c_ColumnNames INTO COLUMN_NAMES;

            END LOOP;

            CLOSE c_ColumnNames;


            -- Add end part

            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):=' FROM ' || schema_name_in ||'.'|| table_name_in || '))';


            v_curr_line:= v_curr_line+1;

            v_ld_select_sql_lines.extend;

            v_ld_select_sql_lines(v_curr_line):=' WHERE DUP_OR_NOT = ''LOGICAL DUPS'';';

            dbms_output.put_line('Output using collection');

            FOR i IN v_ld_select_sql_lines.FIRST .. v_ld_select_sql_lines.LAST

            LOOP

                dbms_output.put_line(v_ld_select_sql_lines(i));

            END LOOP;

            dbms_output.put_line('Output using collection');

            -- Create

        END;

/

GRANT EXECUTE ON DW_DF.LOGICAL_DUP_CHECKER TO DW_TESTER

/

To find logical duplicate:
Logical duplicates are the duplicates that occur in Data warehouse as a result of unnecessary daily column updates from source , which are not required for our tables. Apart from metadata columns like (FROM_DT_ID, THRU_DT_ID, OPEN_DT_ID, CLOSE_DT_ID, CREATE_DT, MOD_DT, CREATE_BY_NM, MOD_BY_NM or any columns that are driven out of Informatica )all columns look alike with in immediate rows for particular Natural key combination. To find those rows, please follow below instructions.
In ODSU or in DIMU, there is a procedure package name DW_DF.LOGICAL_DUP_CHECKER. Right click on that procedure and click Execute. Procedure execution window appears. For any questions regarding the procedure execution or errors, please ask Chirag Patel, author of the Procedure.
*Note:  If there is a refresh of Databases, this procedure will be lost, please ask to redeploy this procedure in both ODSU and DIMU.



1.      Type the schema name: DW_ODS or DW_MART – Which ever table schema you are using. For
2.      Type the Table name you want to find the logical duplicates.
3.      Type in the Natural key combination list for the table that you want to run against.
4.      Type the metadata column list for that particular table. For eg: FROM_DT_ID, THRU_DT_ID, OPEN_DT_ID, CLOSE_DT_ID, CREATE_DT, MOD_DT, CREATE_BY_NM, MOD_BY_NM
5.      Ocolumn_name is the order by column name list that you want to sort.

Click Execute button.

A query will be provided with which we can find the logical duplicates.

No comments: