[Big Data] Hadoop Configuration (Documentation Version)

Published 2023-10-07 12:26:28 · Author: PythonNew_Mr.Wang

Host layout: 192.168.88.101 test1, 192.168.88.102 test2, 192.168.88.103 test3
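
# The three hostnames are assumed to resolve on every node, e.g. via /etc/hosts entries like the following (a sketch; adjust to your own network):
192.168.88.101 test1
192.168.88.102 test2
192.168.88.103 test3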



Hadoop: Upload and Extract the Tarball


Hadoop download (Baidu Cloud): https://pan.baidu.com/s/1DRV_x7Q_ZTUO4KMkr2-6Qg?pwd=not3

# Configure the NameNode host first:

# Upload the Hadoop tarball to the NameNode host and extract it to /export/server/

[test@test1 ~]$ su -   				# switch to the root user
[root@test1 ~]# rz                   # upload the Hadoop tarball
[root@test1 ~]# tar -zxvf hadoop-3.3.4.tar.gz -C /export/server  # extract
[root@test1 ~]# cd /export/server/   # enter the install directory



HDFS: Configure the Four Files


# (1): Configure the workers file (the hosts that will run DataNodes)
[root@test1 server]# cd hadoop-3.3.4/etc/hadoop/   # the hadoop symlink is created later, so use the versioned path here
[root@test1 hadoop]# vim workers 		     
test1
test2
test3  


# (2): Configure hadoop-env.sh
[root@test1 hadoop]# vim hadoop-env.sh
export JAVA_HOME=/export/server/jdk        		   # Java installation directory
export HADOOP_HOME=/export/server/hadoop   		   # Hadoop installation directory
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop     # directory holding the Hadoop configuration files
export HADOOP_LOG_DIR=$HADOOP_HOME/logs            # directory where Hadoop log files are written



# (3): Configure core-site.xml (replace test1 with your own NameNode hostname)
[root@test1 hadoop]# vim core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://test1:8020</value>  
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>131072</value>            
  </property>
</configuration>
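
# fs.defaultFS makes hdfs://test1:8020 the default filesystem, so unqualified paths resolve against it; io.file.buffer.size is the I/O buffer in bytes (131072 = 128 KB). Once HDFS is running (startup is covered below), a quick sanity check:
[hadoop@test1 ~]$ hdfs dfs -ls /        # equivalent to: hdfs dfs -ls hdfs://test1:8020/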



# (4): Configure hdfs-site.xml (replace test1,test2,test3 with your own hostnames)
[root@test1 hadoop]# vim hdfs-site.xml
<configuration>
  <property>
    <name>dfs.datanode.data.dir.perm</name>
    <value>700</value> 
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/data/nn</value> 
  </property>
  <property>
    <name>dfs.namenode.hosts</name>
    <value>test1,test2,test3</value> 
  </property>
  <property>
    <name>dfs.blocksize</name>
    <value>268435456</value>  
  </property>
  <property>
    <name>dfs.namenode.handler.count</name>
    <value>100</value>       
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/data/dn</value> 
  </property>
</configuration>
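
# For reference: dfs.blocksize 268435456 bytes = 256 * 1024 * 1024 = 256 MB per block, and dfs.namenode.handler.count gives the NameNode 100 RPC handler threads. To verify which value a node actually picked up, the stock hdfs getconf tool can be used:
[hadoop@test1 ~]$ hdfs getconf -confKey dfs.blocksize
268435456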



HDFS: Distribute the Configuration to the Other Nodes


# Distribute the configured Hadoop directory to every host

[root@test1 server]# scp -r /export/server/hadoop-3.3.4 test2:/export/server/
[root@test1 server]# scp -r /export/server/hadoop-3.3.4 test3:/export/server/

...
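
# With more hosts, a loop is less error-prone (a sketch; assumes the same host names as in workers and that SSH between the nodes works):
[root@test1 server]# for host in test2 test3; do scp -r /export/server/hadoop-3.3.4 $host:/export/server/; done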



HDFS: Data Storage Directories


# On the NameNode host, create /data/nn and /data/dn
# On each DataNode host, create /data/dn

[root@test1 hadoop]# mkdir -p /data/nn       # test1
[root@test1 hadoop]# mkdir -p /data/dn       # test1

[root@test2 hadoop]# mkdir -p /data/dn       # test2
[root@test3 hadoop]# mkdir -p /data/dn       # test3
...



Hadoop: Symbolic Link


# On every host, create the hadoop symlink (check with ll that it exists)

[root@test1 server]# ln -s /export/server/hadoop-3.3.4 hadoop
[root@test2 server]# ln -s /export/server/hadoop-3.3.4 hadoop
...
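
# A quick check that the link points at the right place:
[root@test1 server]# ls -l /export/server/hadoop    # expect: ... hadoop -> /export/server/hadoop-3.3.4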



Hadoop: Environment Variables


# On every host, add the Hadoop environment variables and apply them

[root@test1 server]# vim /etc/profile
export HADOOP_HOME=/export/server/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

[root@test1 server]# source /etc/profile
...
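
# To confirm the new PATH took effect, run the stock version command; with this install its first line should report Hadoop 3.3.4:
[root@test1 server]# hadoop version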



Hadoop: Grant Ownership to the hadoop User


# On every host, grant ownership of the relevant directories to the hadoop user

[root@test1 server]# chown -R hadoop:hadoop /data
[root@test1 server]# chown -R hadoop:hadoop /export
...
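
# This matters because the start scripts below are run as the hadoop user; a quick ownership check:
[root@test1 server]# ls -ld /data /export/server/hadoop-3.3.4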



HDFS: Startup


# On the NameNode host: format the filesystem, then start the HDFS cluster

[root@test1 server]# su - hadoop
[hadoop@test1 ~]$ cd /export/server/hadoop/etc/hadoop/
[hadoop@test1 hadoop]$ hdfs namenode -format   				# format the NameNode (the old hadoop namenode -format form is deprecated)
[hadoop@test1 hadoop]$ start-dfs.sh            				# start the whole HDFS cluster
Starting namenodes on [test1]
Starting datanodes
Starting secondary namenodes [test1]
[hadoop@test1 hadoop]$ jps
19824 DataNode
20118 SecondaryNameNode
20279 Jps
19694 NameNode

# Web UI address (click Live Nodes to see how many nodes joined)
http://192.168.88.101:9870
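
# A small smoke test once HDFS is up (a sketch; the file name is arbitrary):
[hadoop@test1 ~]$ hdfs dfsadmin -report | grep 'Live datanodes'   # expect 3 live datanodes
[hadoop@test1 ~]$ echo hello | hdfs dfs -put - /smoke.txt         # write a test file (put reads stdin via -)
[hadoop@test1 ~]$ hdfs dfs -cat /smoke.txt                        # read it back
[hadoop@test1 ~]$ hdfs dfs -rm /smoke.txt                         # clean up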

# Controlling daemons individually (hadoop-daemon.sh is deprecated in Hadoop 3):
$HADOOP_HOME/bin/hdfs --daemon (start | status | stop) (namenode | secondarynamenode | datanode)



MapReduce: Configuration Files


# (1) Configure mapred-env.sh
[hadoop@test1 server]$ cd /export/server/hadoop/etc/hadoop/
[hadoop@test1 hadoop]$ vim mapred-env.sh 

export JAVA_HOME=/export/server/jdk           
export HADOOP_JOB_HISTORYSERVER_HEAPSIZE=1000    # JobHistoryServer heap size: 1000 MB (roughly 1 GB)
export HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA        # log level INFO, written via the rolling file appender


# (2) Configure mapred-site.xml (replace test1 with your own hostname)
[hadoop@test1 hadoop]$ vim mapred-site.xml 

<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
    <description>Run MapReduce jobs on YARN</description>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>test1:10020</value>
    <description>IPC address of the JobHistoryServer</description>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>test1:19888</value>
    <description>Web UI address of the JobHistoryServer</description>
  </property>
  <property>
    <name>mapreduce.jobhistory.intermediate-done-dir</name>
    <value>/data/mr-history/tmp</value>
    <description>HDFS directory for in-progress job history files</description>
  </property>
  <property>
    <name>mapreduce.jobhistory.done-dir</name>
    <value>/data/mr-history/done</value>
    <description>HDFS directory for completed job history files</description>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.env</name>
    <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
  </property>
  <property>
    <name>mapreduce.map.env</name>
    <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
  </property>
  <property>
    <name>mapreduce.reduce.env</name>
    <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
  </property>
</configuration>
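
# Note: the two jobhistory directories above live on HDFS, not on the local disk. The JobHistoryServer normally creates them on first start; a sketch to pre-create them by hand if needed:
[hadoop@test1 ~]$ hdfs dfs -mkdir -p /data/mr-history/tmp /data/mr-history/done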



YARN: Configuration Files


# (1) Configure yarn-env.sh
[hadoop@test1 hadoop]$ vim yarn-env.sh

export JAVA_HOME=/export/server/jdk
export HADOOP_HOME=/export/server/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HADOOP_LOG_DIR=$HADOOP_HOME/logs

# (2) Configure yarn-site.xml (replace test1 with your own hostname)
[hadoop@test1 hadoop]$ vim yarn-site.xml 

<configuration>
    <property>
        <name>yarn.log.server.url</name>
        <value>http://test1:19888/jobhistory/logs</value>
        <description>URL of the JobHistoryServer log page used by the YARN web UI</description>
    </property>
    <property>
        <name>yarn.web-proxy.address</name>
        <value>test1:8089</value>
        <description>proxy server hostname and port</description>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
        <description>Configuration to enable or disable log aggregation</description>
    </property>
    <property>
        <name>yarn.nodemanager.remote-app-log-dir</name>
        <value>/tmp/logs</value>
        <description>HDFS directory to which aggregated application logs are moved</description>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>test1</value>
        <description>The hostname of the ResourceManager</description>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
        <description>Scheduler implementation; the Fair Scheduler is used here</description>
    </property>
    <property>
        <name>yarn.nodemanager.local-dirs</name>
        <value>/data/nm-local</value>
        <description>Comma-separated list of paths on the local filesystem where intermediate data is written.</description>
    </property>
    <property>
        <name>yarn.nodemanager.log-dirs</name>
        <value>/data/nm-log</value>
        <description>Comma-separated list of paths on the local filesystem where logs are written.</description>
    </property>
    <property>
        <name>yarn.nodemanager.log.retain-seconds</name>
        <value>10800</value>
        <description>Default time (in seconds) to retain log files on the NodeManager. Only applicable if log aggregation is disabled.</description>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
        <description>Shuffle service that needs to be set for MapReduce applications.</description>
    </property>
</configuration>
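
# Note: yarn.nodemanager.local-dirs and yarn.nodemanager.log-dirs are local paths on every NodeManager host. Since /data was chown'ed to hadoop earlier, the NodeManager can usually create them itself; a sketch to pre-create them on each host if you prefer:
[hadoop@test1 ~]$ mkdir -p /data/nm-local /data/nm-log    # repeat on test2 and test3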



YARN & MapReduce: Distribute the Configuration to the Other Nodes


# Distribute the MapReduce and YARN configuration files to the other nodes

# test2:
scp /export/server/hadoop/etc/hadoop/mapred-env.sh test2:/export/server/hadoop/etc/hadoop/
scp /export/server/hadoop/etc/hadoop/mapred-site.xml test2:/export/server/hadoop/etc/hadoop/
scp /export/server/hadoop/etc/hadoop/yarn-env.sh test2:/export/server/hadoop/etc/hadoop/
scp /export/server/hadoop/etc/hadoop/yarn-site.xml test2:/export/server/hadoop/etc/hadoop/


# test3:
scp /export/server/hadoop/etc/hadoop/mapred-env.sh test3:/export/server/hadoop/etc/hadoop/
scp /export/server/hadoop/etc/hadoop/mapred-site.xml test3:/export/server/hadoop/etc/hadoop/
scp /export/server/hadoop/etc/hadoop/yarn-env.sh test3:/export/server/hadoop/etc/hadoop/
scp /export/server/hadoop/etc/hadoop/yarn-site.xml test3:/export/server/hadoop/etc/hadoop/


... repeat for each additional host



YARN: Startup


# Start YARN (MapReduce itself has no daemon to start)
[hadoop@test1 hadoop]$ $HADOOP_HOME/sbin/start-yarn.sh  # use stop-yarn.sh to stop
Starting resourcemanager
Starting nodemanagers
[hadoop@test1 hadoop]$ jps
25338 ResourceManager
25456 NodeManager
25668 WebAppProxyServer
10481 NameNode
10897 SecondaryNameNode
10610 DataNode
25916 Jps


# Start the history server (HADOOP_HOME must match the value set in /etc/profile; note mapred lives in bin/, not sbin/)
[hadoop@test1 bin]$ $HADOOP_HOME/bin/mapred --daemon start historyserver # replace start with stop to stop it
[hadoop@test1 bin]$ jps
25456 NodeManager
10481 NameNode
10897 SecondaryNameNode
10610 DataNode
25668 WebAppProxyServer
26230 Jps
25338 ResourceManager
26175 JobHistoryServer

# Web UI address (click Nodes to view the cluster nodes)
http://192.168.88.101:8088 

# Controlling daemons individually:
$HADOOP_HOME/bin/yarn --daemon (start|stop) (resourcemanager|nodemanager|proxyserver)
$HADOOP_HOME/bin/mapred --daemon (start|stop) historyserver
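

# End-to-end smoke test (a sketch; the examples jar ships with the Hadoop 3.3.4 distribution):
[hadoop@test1 ~]$ hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.4.jar pi 3 5
# The job shows up at http://192.168.88.101:8088 while running, and in the JobHistoryServer UI (http://test1:19888) after it finishes.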