$ bin/hive
hive> create database oracle;
hive> dfs -ls /user/hive/warehouse;
drwxr-xr-x - jack supergroup 0 2020-03-30 22:43 /user/hive/warehouse/oracle.db

  1. <a name="tXyoo"></a>
  2. ## java action传参
  3. java程序:传递运行参数开始时间、时间间隔,根据计算导入时间区间。参考oozie自带示例“java-main”。
  4. ```bash
  5. $ cp -r examples/apps/java-main/ oozie-apps/java-main-capture
  6. $ touch lib/MyDateRangeWithOozie.java

MyDateRangeWithOozie.java

1.7版本

  1. import java.io.File;
  2. import java.io.FileOutputStream;
  3. import java.io.IOException;
  4. import java.io.OutputStream;
  5. import java.text.ParseException;
  6. import java.text.SimpleDateFormat;
  7. import java.util.Date;
  8. import java.util.Properties;
  9. import org.apache.hadoop.fs.FileSystem;
  10. import org.apache.hadoop.fs.Path;
  11. import org.apache.hadoop.fs.FileStatus;
  12. import org.apache.hadoop.conf.Configuration;
  13. public class MyDateRangeWithOozie {
  14. private static final String OOZIE_ACTION_OUTPUT_PROPERTIES = "oozie.action.output.properties";
  15. private static final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  16. public static String addHoursFromStart(String start, int hours) throws ParseException {
  17. Date date = format.parse(start);
  18. long time = date.getTime();
  19. date.setTime(time + hours * 60 * 60 * 1000);
  20. String end = format.format(date);
  21. return end;
  22. }
  23. public static int getFileNum(FileSystem fs, Path path) throws IOException {
  24. if (!fs.exists(path))
  25. return 0;
  26. FileStatus[] files = fs.listStatus(path);
  27. return files.length;
  28. }
  29. public static void main(String[] args) throws IOException, ParseException {
  30. // System.out.println(addHoursFromStart("2016-3-12 00:00:00", 12));
  31. // 数据导入目录 时间间隔 开始时间
  32. if (args.length != 3) {
  33. System.out.println(
  34. "Usage: MyDateRangeWithOozie " +
  35. "<target-dir path> " +
  36. "<start datetime> " +
  37. "<interval hour>");
  38. System.exit(1);
  39. }
  40. System.out.println("# Arguments: " + args.length);
  41. for (int i = 0; i < args.length; i++) {
  42. System.out.println("Argument[" + i + "]: " + args[i]);
  43. }
  44. // /user/sqoop/user
  45. String dirPath = args[0];
  46. // 2
  47. int hours = Integer.parseInt(args[1]);
  48. // 2016-3-12 00:00:00
  49. String start = args[2];
  50. String end;
  51. Configuration conf = new Configuration();
  52. FileSystem fs = FileSystem.get(conf);
  53. Path path = new Path(dirPath);
  54. int num = getFileNum(fs, path);
  55. end = addHoursFromStart(start, hours * (num+1));
  56. start = addHoursFromStart(start, hours * num);
  57. String oozieProp = System.getProperty(OOZIE_ACTION_OUTPUT_PROPERTIES);
  58. if (oozieProp != null) {
  59. File propFile = new File(oozieProp);
  60. Properties props = new Properties();
  61. props.setProperty("start", start);
  62. props.setProperty("end", end);
  63. OutputStream os = new FileOutputStream(propFile);
  64. props.store(os, "");
  65. os.flush();
  66. os.close();
  67. } else
  68. throw new RuntimeException(OOZIE_ACTION_OUTPUT_PROPERTIES
  69. + " System property not defined");
  70. }
  71. }

workflow.xml

  • 注意start、end的值。

    ```xml
    1. <java>
    2. <job-tracker>${jobTracker}</job-tracker>
    3. <name-node>${nameNode}</name-node>
    4. <configuration>
    5. <property>
    6. <name>mapred.job.queue.name</name>
    7. <value>${queueName}</value>
    8. </property>
    9. </configuration>
    10. <main-class>MyDateRangeWithOozie</main-class>
    11. <arg>${sqoopTargetDir}</arg>
    12. <arg>${intervalHours}</arg>
    13. <arg>${startTime}</arg>
    14. <capture-output/>
    15. </java>
    16. <ok to="check-output"/>
    17. <error to="fail"/>
    1. <switch>
    2. <case to="end">
    3. ${wf:actionData('java-node')['end'] eq '2016-03-11 12:00:00'}
    4. </case>
    5. <default to="fail-output"/>
    6. </switch>
    1. <message>Java failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    1. <message>start [${wf:actionData('java-node')['start']}]; end [${wf:actionData('java-node')['end']}];</message>

  1. <a name="AKUFc"></a>
  2. ### job.properties
  3. ```properties
  4. nameNode=hdfs://192.168.32.130:8020
  5. jobTracker=192.168.32.130:8032
  6. queueName=default
  7. examplesRoot=oozie-apps
  8. oozie.wf.application.path=${nameNode}/user/${user.name}/${examplesRoot}/java-main-capture
  9. sqoopTargetDir=/user/hive/warehouse/oracle.db/h_log_lotbase
  10. intervalHours=12
  11. startTime=2016-3-11 00:00:00

运行测试

编译打jar包,运行测试:

  1. # 独立(本地)模式安装的Hadoop目录
  2. $ export HADOOP_HOME=~/Documents/hadoop/
  3. # 查找编译时需要用到的class文件,添加到CLASSPATH
  4. $ export CLASSPATH=$CLASSPATH:$HADOOP_HOME/share/hadoop/common/hadoop-common-2.5.0-cdh5.3.6.jar
  5. $ cd oozie-apps/java-main-capture/lib/
  6. $ javac MyDateRangeWithOozie.java
  7. $ jar -cvf MyDateRangeWithOozie.jar MyDateRangeWithOozie.class
  8. added manifest
  9. adding: MyDateRangeWithOozie.class(in = 3059) (out= 1646)(deflated 46%)
  10. $ cd ../../../
  11. $ ~/Documents/hadoop/bin/hadoop fs -rm -r -f oozie-apps/java-main-capture/
  12. $ ~/Documents/hadoop/bin/hadoop fs -put -f oozie-apps/java-main-capture/ oozie-apps
  13. $ export OOZIE_URL="http://192.168.32.130:11000/oozie"
  14. $ bin/oozie job -config oozie-apps/java-main-capture/job.properties -run -verbose -debug
  15. job: 0000000-200518015706412-oozie-jack-W
  16. $ bin/oozie job -info 0000000-200518015706412-oozie-jack-W

sqoop action

job.properties

添加下面一行

  1. oozie.use.system.libpath=true

workflow.xml

更改action节点,增加sqoop任务。与前面比较:

  • 删去decision action与kill action

    <workflow-app xmlns="uri:oozie:workflow:0.2" name="java-main-wf">
      <start to="java-node"/>
      <!-- Step 1: run MyDateRangeWithOozie to compute the next [start, end)
           import window. <capture-output/> publishes the start/end properties
           it writes, so later nodes can read wf:actionData('java-node'). -->
      <action name="java-node">
          <java>
              <job-tracker>${jobTracker}</job-tracker>
              <name-node>${nameNode}</name-node>
              <configuration>
                  <property>
                      <name>mapred.job.queue.name</name>
                      <value>${queueName}</value>
                  </property>
              </configuration>
              <main-class>MyDateRangeWithOozie</main-class>
              <!-- Argument order expected by main(): target dir, interval
                   hours, start datetime. -->
              <arg>${sqoopTargetDir}</arg>
              <arg>${intervalHours}</arg>
              <arg>${startTime}</arg>
              <capture-output/>
          </java>
          <ok to="sqoop-freeform-node"/>
          <error to="fail"/>
      </action>

      <!-- Step 2: incremental Sqoop import of H_LOG_LOTBASE bounded by the
           captured window. -->
      <action name="sqoop-freeform-node">
          <sqoop xmlns="uri:oozie:sqoop-action:0.2">
              <job-tracker>${jobTracker}</job-tracker>
              <name-node>${nameNode}</name-node>
              <prepare>
                  <mkdir path="${sqoopTargetDir}"/>
              </prepare>
              <configuration>
                  <property>
                      <name>mapred.job.queue.name</name>
                      <value>${queueName}</value>
                  </property>
              </configuration>
              <arg>import</arg>
              <arg>--connect</arg>
              <arg>jdbc:oracle:thin:@//192.168.1.38:1521/CMASPROD</arg>
              <arg>--username</arg>
              <arg>mes_bc</arg>
              <!-- Password read from an HDFS file (no clear-text password in
                   this workflow definition). -->
              <arg>--password-file</arg>
              <arg>38oracle.pwd</arg>
              <arg>--target-dir</arg>
              <arg>${sqoopTargetDir}</arg>
              <arg>--mysql-delimiters</arg>
              <arg>--table</arg>
              <arg>H_LOG_LOTBASE</arg>
              <!-- Half-open range [start, end) built from the java-node
                   capture-output, so consecutive runs never overlap. -->
              <arg>--where</arg>
              <arg><![CDATA[ TIME_LOGGED >= to_date('${wf:actionData("java-node")["start"]}', 'yyyy-mm-dd HH24:MI:SS') and TIME_LOGGED < to_date('${wf:actionData("java-node")["end"]}', 'yyyy-mm-dd HH24:MI:SS') ]]></arg>
              <arg>--append</arg>
              <arg>-m</arg>
              <arg>1</arg>
              <!-- Oracle JDBC driver shipped in the workflow's lib/ dir. -->
              <archive>lib/ojdbc7.jar</archive>
          </sqoop>
          <ok to="end"/>
          <error to="fail"/>
      </action>
      <kill name="fail">
          <message>Workflow failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
      </kill>
      <end name="end"/>
    </workflow-app>
    

    运行测试

    ```bash
    $ cd ~/Documents/sqoop
    $ echo -n "mes_bc_real" > 38oracle.pwd
    $ ~/Documents/hadoop/bin/hadoop fs -put 38oracle.pwd

$ cd ~/Documents/oozie

拷贝jdbc驱动

$ cp ~/Documents/sqoop/lib/ojdbc7.jar oozie-apps/java-main-capture/lib/
$ ~/Documents/hadoop/bin/hadoop fs -rm -r -f oozie-apps/java-main-capture/
$ ~/Documents/hadoop/bin/hadoop fs -put -f oozie-apps/java-main-capture/ oozie-apps
$ export OOZIE_URL="http://192.168.32.130:11000/oozie"

第一次导入,数据<2016-3-11 00:00:00, 2016-3-11 12:00:00>

$ bin/oozie job -config oozie-apps/java-main-capture/job.properties -run -verbose -debug
job: 0000001-200518015706412-oozie-jack-W
$ bin/oozie job -info 0000001-200518015706412-oozie-jack-W

第二次导入,数据<2016-3-11 12:00:00, 2016-3-12 00:00:00>

$ bin/oozie job -config oozie-apps/java-main-capture/job.properties -run -verbose -debug

列出文件

$ ~/Documents/hadoop/bin/hadoop fs -ls /user/sqoop/oracle/h_log_lotbase
Found 2 items
-rw-r--r-- 1 jack supergroup 723872 2020-05-19 18:36 /user/sqoop/oracle/h_log_lotbase/part-m-00000
-rw-r--r-- 1 jack supergroup 695022 2020-05-19 18:43 /user/sqoop/oracle/h_log_lotbase/part-m-00001

查看head

$ ~/Documents/hadoop/bin/hadoop fs -cat /user/sqoop/oracle/h_log_lotbase/part-m-00000 | head a1e8dc24-40a0-4536-aeea-918890b7130a,2016-03-11 08:58:26.0,COMPLETE,11,3,2016,11,null,null,L216-1603080205,L216-1603080205,2310216030370,8,2016-03-11 08:58:26.0,A1301005,MACPRO_NORMAL_01,ROUTE_SET,ROUTE_SET-B01-01,FALSE,216-1600298,null,8,8,0,0,MACPRO_NORMAL_01,MACPRO_NORMAL_01,ROUTE_SET,PROGRAM,INITIAL,WAIT,null,2016-03-11 08:58:26.0,2016-03-11 08:58:26.0,null,RELEASED,ACTIVE,PCS,2016-03-13 00:00:00.0,null,null,null,null,null,null,null,null ··· ··· $ ~/Documents/hadoop/bin/hadoop fs -cat /user/sqoop/oracle/h_log_lotbase/part-m-00001 | head 0b9e1276-dacf-49f4-b999-b7330c1a1125,2016-03-11 13:00:18.0,COMPLETE,11,3,2016,11,null,null,L216-1602260340,L216-1602260340,2310215121893,2,2016-03-11 13:00:18.0,A1504063,MACPRO_NORMAL_01,MACPRC,CNC-A01-08,FALSE,216-1600235,null,2,2,0,0,MACPRO_NORMAL_01,MACPRO_NORMAL_01,MACPRC,MACPRC,IN_PROCESS,WAIT,null,2016-03-11 11:05:44.0,2016-03-11 13:00:18.0,null,RELEASED,ACTIVE,PCS,2016-03-15 00:00:00.0,null,null,null,null,null,null,null,null ··· ···

<a name="gdIyR"></a>
## hive操作
```bash
$ cd ~/Documents/hive/
$ bin/hive --service metastore

$ bin/hive
hive> use oracle;
hive> show tables;

# 查看导入的数据文件
hive> dfs -ls /user/hive/warehouse/oracle.db/h_log_lotbase;

# Create an external table over the Sqoop-imported files; EXTERNAL means the
# data stays in place and a later DROP TABLE will not delete the files.
hive> create external table h_log_lotbase(
-- Column order must match the column order Sqoop exported from the
-- Oracle table H_LOG_LOTBASE.
log_id string,
time_logged string,
transaction_name string,
calendar_day string,
calendar_month string,
calendar_year string,  
calendar_week string,
calendar_shift string,
comment_text string,
lot_id string,
sublot_id string,
part_no string,
qty double,
lm_time string,
lm_user string,
route_id string,
op_id string,
station_id string,
undone string,
wo_id_mo string,
ticket_id string,
output_qty double,
output_good_qty double,
output_ng_qty double,
scrapt_qty double,
from_route_id string,
to_route_id string,
from_op_id string,
to_op_id string,
from_work_state string,
to_work_state string,
pre_complete_time string,
start_time string,
complete_time string,
line_id string,
hold_status string,
quarantine_status string,
uom string,
due_date string,
stockin_ticket string,
spot_check_qty double,
fetch_qty double,
rou_oper_code string,
rou_oper_name string,
rou_oper_seq string,
session_id string,
scrapt_sublot string
)
-- Matches the comma/newline delimiters produced by Sqoop's --mysql-delimiters.
row format delimited 
fields terminated by ','
lines terminated by '\n';
hive> show tables;

# 导入数据文件
hive> load data inpath '/user/hive/warehouse/oracle.db/h_log_lotbase' into table h_log_lotbase;
# load data inpath '/user/sqoop/oracle/h_log_lotbase' into table h_log_lotbase;

# 分析数据
hive> 
# 前10个(距今最近时间段) time_logged
select time_logged from h_log_lotbase order by time_logged desc limit 2;
2016-03-11 22:47:25.0
2016-03-11 22:38:50.0
# 前10个(距今最远时间段) time_logged
select time_logged from h_log_lotbase order by time_logged limit 2;
2016-03-11 08:35:24.0
2016-03-11 08:35:24.0
#

sqoop创建表结构

$ bin/sqoop create-hive-table \
--connect jdbc:oracle:thin:@//192.168.1.38:1521/CMASPROD \
--username mes_bc \
--password-file 38oracle.pwd \
--mysql-delimiters \
--table H_LOG_LOTBASE