一、配置网络
虚拟机网络使用NAT
模式,网络配置为静态IP,方便节点间通信
vi /etc/sysconfig/network-scripts/ifcfg-ens33
# 网络类型
TYPE=Ethernet
NAME=ens33
UUID=89e7972a-6f3e-4981-8cde-96cd005dfc46
# 网卡设备
DEVICE=ens33
ONBOOT=yes
BOOTPROTO=static
IPADDR=192.168.133.101
GATEWAY=192.168.133.2
DNS1=192.168.133.2
二、配置hostname和hosts
为搭建Hadoop集群,计划准备3台虚拟机,为方便通信,使用统一的hostname命名规则,本次使用"hadoop+IP地址末位"的形式,hostname命名为"hadoop101"、"hadoop102"、"hadoop103"
修改hostname
vi /etc/sysconfig/network
NETWORKING=yes
NETWORKING_IPV6=no
HOSTNAME=hadoop101
修改hosts文件
vi /etc/hosts
192.168.133.101 hadoop101
192.168.133.102 hadoop102
192.168.133.103 hadoop103
三、关闭防火墙
查看防火墙状态
firewall-cmd --state
停止firewall
systemctl stop firewalld.service
禁止firewall开机启动
systemctl disable firewalld.service
四、克隆虚拟机
配置完成后,克隆虚拟机,修改IP
五、ssh免密登录
先从101通过ssh手动登录102、103
ssh root@hadoop102
ssh root@hadoop103
进入.ssh目录
cd ~/.ssh
生成公钥和私钥
ssh-keygen -t rsa
把公钥复制到其它服务器上
ssh-copy-id hadoop102
ssh-copy-id hadoop103
六、服务器时间设置
设置时区为中国标准时间
[root@hadoop101 ~]# timedatectl set-timezone Asia/Shanghai
[root@hadoop101 ~]# timedatectl status
Local time: Mon 2022-04-04 18:21:12 CST
Universal time: Mon 2022-04-04 10:21:12 UTC
RTC time: Mon 2022-04-04 10:21:13
Time zone: Asia/Shanghai (CST, +0800)
NTP enabled: yes
NTP synchronized: yes
RTC in local TZ: no
DST active: n/a
安装ntp,启动ntpd服务,设置ntpd自动启动
注: 只需要在hadoop101上安装和配置ntp,指定向服务器自身进行时钟同步,其它节点通过定时任务定时从101同步时间
yum install ntp -y
systemctl start ntpd
systemctl enable ntpd
修改ntp配置文件 vi /etc/ntp.conf
# 指定接收192.168.133.0网段的客户端的同步请求
restrict 192.168.133.0 mask 255.255.255.0 nomodify notrap
# Use public servers from the pool.ntp.org project.
# Please consider joining the pool (http://www.pool.ntp.org/join.html).
# 注释掉,不使用ntp服务器
#server 0.centos.pool.ntp.org iburst
#server 1.centos.pool.ntp.org iburst
#server 2.centos.pool.ntp.org iburst
#server 3.centos.pool.ntp.org iburst
# 指定向服务器自身进行时钟同步
server 127.127.1.0
fudge 127.127.1.0 stratum 10
七、集群同步文件
rsync
命令可以在服务器间同步数据,例如rsync -rv /path/something root@hadoop102:/path/
,可以将something这个目录或文件同步到hadoop102节点。
编写一个集群同步文件的脚本,实现循环复制文件到所有节点服务器的相同目录。
在/root目录下新建bin目录,在bin目录下新建一个xsync文件,使用chmod +x xsync为其增加执行权限;脚本使用方法:xsync 目录或文件名
。
脚本内容如下:
#!/bin/bash
# xsync: copy a file or directory to the same absolute path on every
# peer node of the cluster (hosts are named hadoop101..hadoop103; this
# runs on 101 and pushes to 102 and 103).
#
# Usage: xsync <file-or-directory>

# Require the path argument; exit non-zero so callers can detect misuse.
pcount=$#
if ((pcount == 0)); then
  echo "no args" >&2
  exit 1
fi

# Name of the file/directory being synced.
p1=$1
fname=$(basename "$p1")
echo "fname=$fname"

# Absolute parent directory; -P resolves symlinks so every node
# receives the real path. Abort if the directory cannot be entered.
pdir=$(cd -P "$(dirname "$p1")" && pwd) || {
  echo "cannot resolve directory of $p1" >&2
  exit 1
}
echo "pdir=$pdir"

# Current user, reused as the remote login account.
user=$(whoami)

# Push to the other nodes: hadoop102 and hadoop103.
for ((host = 102; host < 104; host++)); do
  echo "--------hadoop$host--------"
  rsync -rvl "$pdir/$fname" "$user@hadoop$host:$pdir"
done
八、安装JDK和Hadoop
在hadoop101上安装好jdk1.8
和hadoop2.10.1
,并在/etc/profile下配置好环境变量
export JAVA_HOME=/opt/module/jdk1.8.0_202
export PATH=$PATH:$JAVA_HOME/bin
export HADOOP_HOME=/opt/module/hadoop-2.10.1
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
然后使用xsync脚本同步到其它节点。
xsync /opt/module/jdk1.8.0_202
xsync /opt/module/hadoop-2.10.1
xsync /etc/profile
在各个服务器上执行source /etc/profile
,使配置生效。
九、Hadoop分布式集群部署
- 集群部署规划
| | hadoop101 | hadoop102 | hadoop103 |
|---|---|---|---|
| HDFS | NameNode DataNode | DataNode | SecondaryNameNode DataNode |
| YARN | NodeManager | ResourceManager NodeManager | NodeManager |
- 配置hadoop核心配置文件
vim /opt/module/hadoop-2.10.1/etc/hadoop/core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://hadoop101:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/module/hadoop-2.10.1/data/tmp</value>
</property>
</configuration>
- 配置HDFS
vim /opt/module/hadoop-2.10.1/etc/hadoop/hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoop103:50090</value>
</property>
</configuration>
- 配置YARN
vim /opt/module/hadoop-2.10.1/etc/hadoop/yarn-site.xml
<?xml version="1.0"?>
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop102</value>
</property>
</configuration>
- 配置MapReduce
vim /opt/module/hadoop-2.10.1/etc/hadoop/mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
-
配置JAVA_HOME,否则启动时会报找不到JAVA_HOME的错误,在
hadoop-env.sh
、yarn-env.sh
、mapred-env.sh
这三个脚本中添加export JAVA_HOME=/opt/module/jdk1.8.0_202
-
配置slaves
vim /opt/module/hadoop-2.10.1/etc/hadoop/slaves hadoop101 hadoop102 hadoop103
-
创建数据目录
cd /opt/module/hadoop-2.10.1/ && mkdir data && cd data && mkdir tmp
-
集群分发配置文件
xsync /opt/module/hadoop-2.10.1/
-
初次启动集群,需要执行
hadoop namenode -format
-
启动HDFS
[root@hadoop101 hadoop-2.10.1]# sbin/start-dfs.sh
Starting namenodes on [hadoop101]
hadoop101: starting namenode, logging to /opt/module/hadoop-2.10.1/logs/hadoop-root-namenode-hadoop101.out
hadoop102: starting datanode, logging to /opt/module/hadoop-2.10.1/logs/hadoop-root-datanode-hadoop102.out
hadoop103: starting datanode, logging to /opt/module/hadoop-2.10.1/logs/hadoop-root-datanode-hadoop103.out
hadoop101: starting datanode, logging to /opt/module/hadoop-2.10.1/logs/hadoop-root-datanode-hadoop101.out
Starting secondary namenodes [hadoop103]
hadoop103: starting secondarynamenode, logging to /opt/module/hadoop-2.10.1/logs/hadoop-root-secondarynamenode-hadoop103.out
-
启动YARN
去ResourceManager所在的服务器102上启动YARN
[root@hadoop102 hadoop-2.10.1]# sbin/start-yarn.sh starting yarn daemons starting resourcemanager, logging to /opt/module/hadoop-2.10.1/logs/yarn-root-resourcemanager-hadoop102.out hadoop101: starting nodemanager, logging to /opt/module/hadoop-2.10.1/logs/yarn-root-nodemanager-hadoop101.out hadoop103: starting nodemanager, logging to /opt/module/hadoop-2.10.1/logs/yarn-root-nodemanager-hadoop103.out hadoop102: starting nodemanager, logging to /opt/module/hadoop-2.10.1/logs/yarn-root-nodemanager-hadoop102.out [root@hadoop102 hadoop-2.10.1]# jps 14448 ResourceManager 14592 Jps 14552 NodeManager 12956 DataNode
-
访问HDFS
http://hadoop101:50070/dfshealth.html#tab-overview
- 访问YARN ResourceManager
http://hadoop102:8088/cluster
- 运行PI实例,检查集群是否启动成功
[root@hadoop101 mapreduce]# cd /opt/module/hadoop-2.10.1/share/hadoop/mapreduce
[root@hadoop101 mapreduce]# ls
hadoop-mapreduce-client-app-2.10.1.jar hadoop-mapreduce-client-jobclient-2.10.1.jar lib
hadoop-mapreduce-client-common-2.10.1.jar hadoop-mapreduce-client-jobclient-2.10.1-tests.jar lib-examples
hadoop-mapreduce-client-core-2.10.1.jar hadoop-mapreduce-client-shuffle-2.10.1.jar sources
hadoop-mapreduce-client-hs-2.10.1.jar hadoop-mapreduce-examples-2.10.1.jar
hadoop-mapreduce-client-hs-plugins-2.10.1.jar jdiff
[root@hadoop101 mapreduce]# hadoop jar hadoop-mapreduce-examples-2.10.1.jar pi 10 10
Number of Maps = 10
Samples per Map = 10
Wrote input for Map #0
Wrote input for Map #1
Wrote input for Map #2
Wrote input for Map #3
Wrote input for Map #4
Wrote input for Map #5
Wrote input for Map #6
Wrote input for Map #7
Wrote input for Map #8
Wrote input for Map #9
Starting Job
22/04/05 01:22:27 INFO client.RMProxy: Connecting to ResourceManager at hadoop102/192.168.133.102:8032
22/04/05 01:22:28 INFO input.FileInputFormat: Total input files to process : 10
22/04/05 01:22:28 INFO mapreduce.JobSubmitter: number of splits:10
22/04/05 01:22:28 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1649091969708_0002
22/04/05 01:22:28 INFO conf.Configuration: resource-types.xml not found
22/04/05 01:22:28 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
22/04/05 01:22:28 INFO resource.ResourceUtils: Adding resource type - name = memory-mb, units = Mi, type = COUNTABLE
22/04/05 01:22:28 INFO resource.ResourceUtils: Adding resource type - name = vcores, units = , type = COUNTABLE
22/04/05 01:22:29 INFO impl.YarnClientImpl: Submitted application application_1649091969708_0002
22/04/05 01:22:29 INFO mapreduce.Job: The url to track the job: http://hadoop102:8088/proxy/application_1649091969708_0002/
22/04/05 01:22:29 INFO mapreduce.Job: Running job: job_1649091969708_0002
22/04/05 01:22:38 INFO mapreduce.Job: Job job_1649091969708_0002 running in uber mode : false
22/04/05 01:22:38 INFO mapreduce.Job: map 0% reduce 0%
22/04/05 01:22:51 INFO mapreduce.Job: map 20% reduce 0%
22/04/05 01:23:10 INFO mapreduce.Job: map 20% reduce 7%
22/04/05 01:23:21 INFO mapreduce.Job: map 50% reduce 7%
22/04/05 01:23:26 INFO mapreduce.Job: map 70% reduce 7%
22/04/05 01:23:27 INFO mapreduce.Job: map 100% reduce 7%
22/04/05 01:23:28 INFO mapreduce.Job: map 100% reduce 100%
22/04/05 01:23:29 INFO mapreduce.Job: Job job_1649091969708_0002 completed successfully
22/04/05 01:23:29 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=226
FILE: Number of bytes written=2298230
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=2630
HDFS: Number of bytes written=215
HDFS: Number of read operations=43
HDFS: Number of large read operations=0
HDFS: Number of write operations=3
Job Counters
Launched map tasks=10
Launched reduce tasks=1
Data-local map tasks=10
Total time spent by all maps in occupied slots (ms)=373219
Total time spent by all reduces in occupied slots (ms)=34501
Total time spent by all map tasks (ms)=373219
Total time spent by all reduce tasks (ms)=34501
Total vcore-milliseconds taken by all map tasks=373219
Total vcore-milliseconds taken by all reduce tasks=34501
Total megabyte-milliseconds taken by all map tasks=382176256
Total megabyte-milliseconds taken by all reduce tasks=35329024
Map-Reduce Framework
Map input records=10
Map output records=20
Map output bytes=180
Map output materialized bytes=280
Input split bytes=1450
Combine input records=0
Combine output records=0
Reduce input groups=2
Reduce shuffle bytes=280
Reduce input records=20
Reduce output records=0
Spilled Records=40
Shuffled Maps =10
Failed Shuffles=0
Merged Map outputs=10
GC time elapsed (ms)=7721
CPU time spent (ms)=13550
Physical memory (bytes) snapshot=2082910208
Virtual memory (bytes) snapshot=22879621120
Total committed heap usage (bytes)=1383833600
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=1180
File Output Format Counters
Bytes Written=97
Job Finished in 62.362 seconds
Estimated value of Pi is 3.20000000000000000000