--SQLBits Demos 2014 *****************************SCENARIO***************************************** We have some statement data in SQL Server that we want to scrub. Our warehouse is in Hadoop so we want the data to be available there We also want to push back the data to SQL Server once it is cleansed *****************************SCENARIO***************************************** *****************************WORKFLOW***************************************** SQOOP statement table from SQL Server into Hadoop (statementpreclean) PIG directory removing the rows where statementamount = 0 PIG the cleansed data into statementclean HIVE the statementclean table into statement SQOOP the statementclean table into SQL Server *****************************WORKFLOW***************************************** *****************************STEPS***************************************** ******SQOOP ** show connecting for table list sqoop list-tables --connect "jdbc:sqlserver://localhost;database=contoso;" --username sqoop -P ** import table with one statement //get rid of directory first sqoop import --connect "jdbc:sqlserver://localhost;database=contoso" --username sqoop -P --query "select statement_no, member_no, statement_dt, due_dt, statement_amt from statement WHERE $CONDITIONS" --target-dir /user/demo/statementpreclean -m 1 ** import table with 4 statements sqoop import --connect "jdbc:sqlserver://localhost;database=contoso" --username sqoop -P --query "select statement_no, member_no, statement_dt, due_dt, statement_amt from statement WHERE $CONDITIONS" --split-by statement_no --target-dir /user/demo/statementpreclean -m 4 ** show differences in profiler ********PIG predata = LOAD '/user/demo/statementpreclean/part*' USING PigStorage(',') AS (statementno:int, memberno:int, statementdate:chararray, duedate:chararray, statementamount:double); filt = FILTER predata BY statementamount > 0.00; limited = LIMIT filt 10; DUMP limited; ** just dump to the console ** show that the statementclean table is empty ***********HIVE CREATE EXTERNAL TABLE member ( memberno int, lastname string, firstname string, regionno int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/demo/member/'; CREATE EXTERNAL TABLE region ( regionno int, regionname string ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/demo/region/'; describe formatted member describe formatted region CREATE EXTERNAL TABLE statement ( statementno int, memberno int, statementdate string, duedate string, statementamount double ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/demo/statement/' CREATE EXTERNAL TABLE statementclean ( statementno int, memberno int, statementdate string, duedate string, statementamount double ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/user/demo/statementclean/' SELECT * FROM statement LIMIT 10; SELECT * FROM statementclean LIMIT 10; *****************PIG **now make sure the cleansed directory is not there and push the data into the clean directory fs -rmr /user/demo/statementclean; fs -rmr /user/demo/statement; STORE filt INTO '/user/demo/statementclean' USING PigStorage(','); ***************HIVE INSERT INTO TABLE statement SELECT * FROM statementclean; SELECT * FROM statement; ** now let's look at the data altogether SELECT CONCAT(m.firstname, " ", m.lastname) as Name, AVG(s.statementamount) as statementamount FROM member m JOIN statement s ON m.memberno = s.memberno GROUP BY CONCAT(m.firstname, " ", m.lastname); SELECT r.regionname as Name, AVG(s.statementamount) as statementamount FROM region r JOIN member m ON r.regionno = m.regionno JOIN statement s ON m.memberno = s.memberno GROUP BY r.regionname; **SQOOP the clean data back to SQL Server //truncate the statement2 table sqoop export --connect "jdbc:sqlserver://localhost;database=contoso" --table statement2 --export-dir /user/demo/statementclean/ --username sqoop --P -m 1 --fields-terminated-by "," --lines-terminated-by "\n"