日子总是过得很快,却近半年时间没有记录一篇博客。开发依然在做着,却少了很多整理的动力。最近开始做大规模机器学习相关的东西,需要在EC2上搭建Hadoop集群,在此做一记录。
local$ sudo chmod 600 ~/.ssh/pem_key_filename
local$ ssh -i ~/.ssh/pem_key_filename ubuntu@namenode_public_dns
尝试登陆namenode。可以将连接信息(Host、HostName、User、IdentityFile)写入~/.ssh/config,实现local免密码登录namenode。随后在namenode上生成密钥对:
namenode$ ssh-keygen -f ~/.ssh/id_rsa -t rsa -P ""
namenode$ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
cat ~/.ssh/id_rsa.pub | ssh datanode1 'cat >> ~/.ssh/authorized_keys'
cat ~/.ssh/id_rsa.pub | ssh datanode2 'cat >> ~/.ssh/authorized_keys'
cat ~/.ssh/id_rsa.pub | ssh datanode3 'cat >> ~/.ssh/authorized_keys'
allnodes$ sudo apt-get update
allnodes$ sudo apt-get install openjdk-7-jdk
allnodes$ java -version
allnodes$ wget http://apache.mirrors.tds.net/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz -P ~/Downloads
vim ~/.profile
export JAVA_HOME=/usr
export PATH=$PATH:$JAVA_HOME/bin
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
allnodes$ . ~/.profile
allnodes$ sudo vim $HADOOP_CONF_DIR/hadoop-env.sh
export JAVA_HOME=/usr
vim $HADOOP_CONF_DIR/core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://`namenode_public_dns`:9000</value>
</property>
</configuration>
vim $HADOOP_CONF_DIR/yarn-site.xml
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>`namenode_public_dns`</value>
</property>
</configuration>
allnodes$ sudo cp $HADOOP_CONF_DIR/mapred-site.xml.template $HADOOP_CONF_DIR/mapred-site.xml
vim $HADOOP_CONF_DIR/mapred-site.xml
<configuration>
<property>
<name>mapreduce.jobtracker.address</name>
<value>`namenode_public_dns`:54311</value>
</property>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
vim /etc/hosts
# 注:删除/替换原有指向本机内网主机名(例如 ip-172-31-35-242)的映射行 — 请按实际实例主机名确认
127.0.0.1 localhost
namenode_public_dns namenode_hostname
datanode1_public_dns datanode1_hostname
datanode2_public_dns datanode2_hostname
datanode3_public_dns datanode3_hostname
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
ff02::3 ip6-allhosts
vim $HADOOP_CONF_DIR/hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///usr/local/hadoop/hadoop_data/hdfs/namenode</value>
</property>
</configuration>
namenode$ sudo mkdir -p $HADOOP_HOME/hadoop_data/hdfs/namenode
namenode$ sudo touch $HADOOP_CONF_DIR/masters
vim $HADOOP_CONF_DIR/masters
namenode_hostname
vim $HADOOP_CONF_DIR/slaves
datanode1_hostname
datanode2_hostname
datanode3_hostname
namenode$ sudo chown -R ubuntu $HADOOP_HOME
vim $HADOOP_CONF_DIR/hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///usr/local/hadoop/hadoop_data/hdfs/datanode</value>
</property>
</configuration>
datanodes$ sudo mkdir -p $HADOOP_HOME/hadoop_data/hdfs/datanode
datanodes$ sudo chown -R ubuntu $HADOOP_HOME
namenode$ hdfs namenode -format
注意:格式化namenode后,其上之前的所有数据都会丢失(all the data previously on it will be lost)。然后启动HDFS:
namenode$ $HADOOP_HOME/sbin/start-dfs.sh
在浏览器打开 namenode_public_dns:50070,查看Cluster的状态,应该有3个Live Nodes。
接下来启动YARN以及MapReduce JobHistory Server:
namenode$ $HADOOP_HOME/sbin/start-yarn.sh
namenode$ $HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
用jps查看namenode上的进程状态:
namenode$ jps
JobHistoryServer
Jps
SecondaryNameNode
ResourceManager
NameNode
用jps查看datanode上的进程状态:
datanodes$ jps
NodeManager
DataNode
Jps