image.png

    1. 使用Flume-1监控文件变动,Flume-1将变动内容传递给Flume-2,Flume-2负责存储到HDFS;同时Flume-1将变动内容传递给Flume-3,Flume-3负责输出到Local FileSystem
    2. mkdir -p /opt/egg/apache-flume-1.7.0-bin/group1
    3. 配置1个接收日志文件的source和两个channel、两个sink,分别输送给flume-flume-hdfs和flume-flume-dir
    4. vim flume-file-flume.conf
    5. # Name the components on this agent
    6. a1.sources = r1
    7. a1.sinks = k1 k2
    8. a1.channels = c1 c2
    9. # 将数据流复制给所有channel
    10. a1.sources.r1.selector.type = replicating
    11. # Describe/configure the source
    12. a1.sources.r1.type = exec
    13. a1.sources.r1.command = tail -F /opt/ha/hadoop-2.7.2/logs/hadoop-root-namenode-hadoop1.log
    14. a1.sources.r1.shell = /bin/bash -c
    15. # Describe the sink
    16. # sink端的avro是一个数据发送者
    17. a1.sinks.k1.type = avro
    18. a1.sinks.k1.hostname = hadoop1
    19. a1.sinks.k1.port = 4141
    20. a1.sinks.k2.type = avro
    21. a1.sinks.k2.hostname = hadoop1
    22. a1.sinks.k2.port = 4142
    23. # Describe the channel
    24. a1.channels.c1.type = memory
    25. a1.channels.c1.capacity = 1000
    26. a1.channels.c1.transactionCapacity = 100
    27. a1.channels.c2.type = memory
    28. a1.channels.c2.capacity = 1000
    29. a1.channels.c2.transactionCapacity = 100
    30. # Bind the source and sink to the channel
    31. a1.sources.r1.channels = c1 c2
    32. a1.sinks.k1.channel = c1
    33. a1.sinks.k2.channel = c2
    34. 输出是到HDFS的Sink
    35. vim flume-flume-hdfs.conf
    36. # Name the components on this agent
    37. a2.sources = r1
    38. a2.sinks = k1
    39. a2.channels = c1
    40. # Describe/configure the source
    41. # source端的avro是一个数据接收服务
    42. a2.sources.r1.type = avro
    43. a2.sources.r1.bind = hadoop1
    44. a2.sources.r1.port = 4141
    45. # Describe the sink
    46. a2.sinks.k1.type = hdfs
    47. a2.sinks.k1.hdfs.path = hdfs://hadoop1:9000/flume2/%Y%m%d/%H
    48. #上传文件的前缀
    49. a2.sinks.k1.hdfs.filePrefix = flume2-
    50. #是否按照时间滚动文件夹
    51. a2.sinks.k1.hdfs.round = true
    52. #多少时间单位创建一个新的文件夹
    53. a2.sinks.k1.hdfs.roundValue = 1
    54. #重新定义时间单位
    55. a2.sinks.k1.hdfs.roundUnit = hour
    56. #是否使用本地时间戳
    57. a2.sinks.k1.hdfs.useLocalTimeStamp = true
    58. #积攒多少个Event才flush到HDFS一次
    59. a2.sinks.k1.hdfs.batchSize = 100
    60. #设置文件类型,可支持压缩
    61. a2.sinks.k1.hdfs.fileType = DataStream
    62. #多久生成一个新的文件
    63. a2.sinks.k1.hdfs.rollInterval = 600
    64. #设置每个文件的滚动大小大概是128M
    65. a2.sinks.k1.hdfs.rollSize = 134217700
    66. #文件的滚动与Event数量无关
    67. a2.sinks.k1.hdfs.rollCount = 0
    68. # Describe the channel
    69. a2.channels.c1.type = memory
    70. a2.channels.c1.capacity = 1000
    71. a2.channels.c1.transactionCapacity = 100
    72. # Bind the source and sink to the channel
    73. a2.sources.r1.channels = c1
    74. a2.sinks.k1.channel = c1
    75. 输出是到本地目录的Sink:输出的本地目录必须是已经存在的目录,如果该目录不存在,并不会创建新的目录
    76. vim flume-flume-dir.conf
    77. # Name the components on this agent
    78. a3.sources = r1
    79. a3.sinks = k1
    80. a3.channels = c2
    81. # Describe/configure the source
    82. a3.sources.r1.type = avro
    83. a3.sources.r1.bind = hadoop1
    84. a3.sources.r1.port = 4142
    85. # Describe the sink
    86. a3.sinks.k1.type = file_roll
    87. a3.sinks.k1.sink.directory = /opt/egg/apache-flume-1.7.0-bin/group1
    88. # Describe the channel
    89. a3.channels.c2.type = memory
    90. a3.channels.c2.capacity = 1000
    91. a3.channels.c2.transactionCapacity = 100
    92. # Bind the source and sink to the channel
    93. a3.sources.r1.channels = c2
    94. a3.sinks.k1.channel = c2
    95. 启动flume
    96. bin/flume-ng agent --conf conf/ --name a3 --conf-file job/group1/flume-flume-dir.conf
    97. bin/flume-ng agent --conf conf/ --name a2 --conf-file job/group1/flume-flume-hdfs.conf
    98. bin/flume-ng agent --conf conf/ --name a1 --conf-file job/group1/flume-file-flume.conf
    99. 制造一些日志(启动Hadoop与Hive,使被监控的NameNode日志文件发生变动)
    100. sbin/start-dfs.sh
    101. sbin/start-yarn.sh
    102. bin/hive