hadoop-ec2-init-remote.sh
#!/bin/bash -x
#
# Modified version of hadoop-ec2-init-remote.sh, customized to install
# Cloudera Desktop.
#
#
################################################################################
# Script that is run on each EC2 instance on boot. It is passed in as EC2 user
# data, so it should not exceed 16K in size after gzip compression.
#
# This script is executed by /etc/init.d/ec2-run-user-data, and output is
# logged to /var/log/messages.
################################################################################
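#
# One way to sanity-check the payload size before launching a cluster (a
# suggested local check, not part of this script; adjust the filename to
# wherever your copy lives):
#   gzip -c hadoop-ec2-init-remote.sh | wc -c   # must stay below 16384 bytes
#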
################################################################################
# Initialize variables
################################################################################
# Substitute environment variables passed by the client
export %ENV%
if [ -z "$MASTER_HOST" ]; then
IS_MASTER=true
MASTER_HOST=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname`
else
IS_MASTER=false
fi
# Force versions
REPO="testing"
HADOOP="hadoop-0.20"
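# Schedule an automatic halt $AUTO_SHUTDOWN minutes from now, if requested.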
function register_auto_shutdown() {
if [ ! -z "$AUTO_SHUTDOWN" ]; then
shutdown -h +$AUTO_SHUTDOWN >/dev/null &
fi
}
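# Point the system package manager at the Cloudera $REPO repository
# (an APT source on Debian/Ubuntu, a yum repo file on Red Hat derivatives).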
function update_repo() {
if which dpkg &> /dev/null; then
cat > /etc/apt/sources.list.d/cloudera.list <<EOF
deb http://archive.cloudera.com/debian intrepid-$REPO contrib
deb-src http://archive.cloudera.com/debian intrepid-$REPO contrib
EOF
apt-get update
elif which rpm &> /dev/null; then
rm -f /etc/yum.repos.d/cloudera.repo
cat > /etc/yum.repos.d/cloudera-$REPO.repo <<EOF
[cloudera-$REPO]
name=Cloudera's Distribution for Hadoop ($REPO)
baseurl=http://archive.cloudera.com/redhat/cdh/$REPO/
gpgkey = http://archive.cloudera.com/redhat/cdh/RPM-GPG-KEY-cloudera
gpgcheck = 0
EOF
yum update -y yum
fi
}
# Install a list of packages on debian or redhat as appropriate
function install_packages() {
if which dpkg &> /dev/null; then
apt-get update
apt-get -y install $@
elif which rpm &> /dev/null; then
yum install -y $@
else
echo "No package manager found."
fi
}
# Install any user packages specified in the USER_PACKAGES environment variable
function install_user_packages() {
if [ ! -z "$USER_PACKAGES" ]; then
install_packages $USER_PACKAGES
fi
}
# Install Hadoop packages and dependencies
function install_hadoop() {
if which dpkg &> /dev/null; then
apt-get update
apt-get -y install $HADOOP
cp -r /etc/$HADOOP/conf.empty /etc/$HADOOP/conf.dist
update-alternatives --install /etc/$HADOOP/conf $HADOOP-conf /etc/$HADOOP/conf.dist 90
apt-get -y install pig${PIG_VERSION:+-${PIG_VERSION}}
apt-get -y install hadoop-pig${PIG_VERSION:+-${PIG_VERSION}}
apt-get -y install hive${HIVE_VERSION:+-${HIVE_VERSION}}
apt-get -y install policykit # http://www.bergek.com/2008/11/24/ubuntu-810-libpolkit-error/
elif which rpm &> /dev/null; then
yum install -y $HADOOP
cp -r /etc/$HADOOP/conf.empty /etc/$HADOOP/conf.dist
if [ ! -e /etc/alternatives/$HADOOP-conf ]; then # CDH1 RPMs use a different alternatives name
conf_alternatives_name=hadoop
else
conf_alternatives_name=$HADOOP-conf
fi
alternatives --install /etc/$HADOOP/conf $conf_alternatives_name /etc/$HADOOP/conf.dist 90
yum install -y hadoop-pig${PIG_VERSION:+-${PIG_VERSION}}
yum install -y hadoop-hive${HIVE_VERSION:+-${HIVE_VERSION}}
fi
}
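# prep_disk MOUNT DEVICE [ADD_TO_FSTAB]
# Format DEVICE as XFS (erasing its contents), mount it at MOUNT, and
# optionally record the mount in /etc/fstab.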
function prep_disk() {
mount=$1
device=$2
automount=${3:-false}
echo "warning: ERASING CONTENTS OF $device"
mkfs.xfs -f $device
if [ ! -e $mount ]; then
mkdir $mount
fi
mount -o defaults,noatime $device $mount
if $automount ; then
echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab
fi
}
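# wait_for_mount MOUNT DEVICE [ADD_TO_FSTAB]
# Retry every 10 seconds until DEVICE (typically a freshly attached EBS volume)
# can be mounted at MOUNT, then optionally record the mount in /etc/fstab.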
function wait_for_mount {
mount=$1
device=$2
automount=${3:-true} # whether to record the mount in /etc/fstab (recorded by default)
mkdir $mount
i=1
echo "Attempting to mount $device"
while true ; do
sleep 10
echo -n "$i "
i=$((i+1))
mount -o defaults,noatime $device $mount || continue
echo " Mounted."
if $automount ; then
echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab
fi
break;
done
}
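# Create a hadoop-owned hadoop/ directory under each of the given mount points.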
function make_hadoop_dirs {
for mount in "$@"; do
if [ ! -e $mount/hadoop ]; then
mkdir -p $mount/hadoop
chown hadoop:hadoop $mount/hadoop
fi
done
}
# Configure Hadoop by setting up disks and site file
function configure_hadoop() {
INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type`
install_packages xfsprogs # needed for XFS
# Mount home volume, if any, and strip it from the EBS_MAPPINGS
mount_home_volume
if [ -n "$EBS_MAPPINGS" ]; then
# If there are EBS volumes, use them for persistent HDFS
scaffold_ebs_hdfs
else
# Otherwise, make a blank HDFS on the local drives
scaffold_local_hdfs
fi
# Set up all the instance-local directories
scaffold_hadoop_dirs
# Populate the various config files
create_hadoop_conf
}
# Look for a home-volume mapping, which must be named "/mnt/home" (as defined in
# ec2-storage-YOURCLUSTER.json), mount it, and exclude it from HDFS.
function mount_home_volume {
if [[ $EBS_MAPPINGS =~ '/mnt/home,' ]] ; then
# Extract and strip the mapping from the EBS_MAPPINGS
mapping=`echo $EBS_MAPPINGS | sed 's|.*\(/mnt/home,[^;]*\);*.*|\1|'`
EBS_MAPPINGS=`echo $EBS_MAPPINGS | sed 's|/mnt/home,[^;]*;*||'`
echo "Mounting $mapping but not using it for HDFS"
mount=${mapping%,*}
device=${mapping#*,}
wait_for_mount $mount $device
fi
}
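# Mount each EBS volume listed in EBS_MAPPINGS and build the comma-separated
# DFS_NAME_DIR, FS_CHECKPOINT_DIR and DFS_DATA_DIR lists from those mounts.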
function scaffold_ebs_hdfs {
# EBS_MAPPINGS is like "/ebs1,/dev/sdj;/ebs2,/dev/sdk"
DFS_NAME_DIR=''
FS_CHECKPOINT_DIR=''
DFS_DATA_DIR=''
for mapping in $(echo "$EBS_MAPPINGS" | tr ";" "\n"); do
# Split on the comma (see "Parameter Expansion" in the bash man page)
mount=${mapping%,*}
device=${mapping#*,}
wait_for_mount $mount $device
DFS_NAME_DIR=${DFS_NAME_DIR},"$mount/hadoop/hdfs/name"
FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR},"$mount/hadoop/hdfs/secondary"
DFS_DATA_DIR=${DFS_DATA_DIR},"$mount/hadoop/hdfs/data"
FIRST_MOUNT=${FIRST_MOUNT-$mount}
make_hadoop_dirs $mount
done
# Remove leading commas
DFS_NAME_DIR=${DFS_NAME_DIR#?}
FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR#?}
DFS_DATA_DIR=${DFS_DATA_DIR#?}
DFS_REPLICATION=3 # EBS is internally replicated, but we also use HDFS replication for safety
}
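# No EBS volumes: lay HDFS out on the instance-local drives, sized by instance type.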
function scaffold_local_hdfs {
case $INSTANCE_TYPE in
m1.xlarge|c1.xlarge)
DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name
FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary
DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data,/mnt3/hadoop/hdfs/data,/mnt4/hadoop/hdfs/data
;;
m1.large)
DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name
FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary
DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data
;;
*)
# "m1.small" or "c1.medium"
DFS_NAME_DIR=/mnt/hadoop/hdfs/name
FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary
DFS_DATA_DIR=/mnt/hadoop/hdfs/data
;;
esac
FIRST_MOUNT=/mnt
DFS_REPLICATION=3
}
# Common directories, whether the HDFS is instance-local or EBS
# Settings appropriate to instance type: http://aws.amazon.com/ec2/instance-types/
function scaffold_hadoop_dirs {
case $INSTANCE_TYPE in
m1.xlarge|c1.xlarge)
# 15GB 4core x 2 64bit (m1.xlarge) $0.80/hr
# 7GB 8core x 2.5 64bit (c1.xlarge) $0.80/hr
prep_disk /mnt2 /dev/sdc true &
disk2_pid=$!
prep_disk /mnt3 /dev/sdd true &
disk3_pid=$!
prep_disk /mnt4 /dev/sde true &
disk4_pid=$!
wait $disk2_pid $disk3_pid $disk4_pid
MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local,/mnt3/hadoop/mapred/local,/mnt4/hadoop/mapred/local
MAX_MAP_TASKS=8 # 8 orig
MAX_REDUCE_TASKS=4 # 4 orig
CLUSTER_REDUCE_TASKS=10 # 10 orig
CHILD_OPTS=-Xmx680m
CHILD_ULIMIT=1392640
;;
m1.large)
# 7.5GB 2 core x 2 64bit $0.40/hr
prep_disk /mnt2 /dev/sdc true
MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local
MAX_MAP_TASKS=4 # 4 orig
MAX_REDUCE_TASKS=2 # 2 orig
CLUSTER_REDUCE_TASKS=10 # 10 orig
CHILD_OPTS=-Xmx1024m
CHILD_ULIMIT=2097152
;;
c1.medium)
# 1.7GB 2 core x 2.5 32bit $0.20/hr
MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local
MAX_MAP_TASKS=4 # 4 orig
MAX_REDUCE_TASKS=2 # 2 orig
CLUSTER_REDUCE_TASKS=10 # 10 orig
CHILD_OPTS=-Xmx550m
CHILD_ULIMIT=1126400
;;
*)
# "m1.small"
# 1.7GB 1 core x 1 32bit $0.10/hr
MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local
MAX_MAP_TASKS=2 # 2 orig
MAX_REDUCE_TASKS=1 # 1 orig
CLUSTER_REDUCE_TASKS=10 # 10 orig
CHILD_OPTS=-Xmx550m
CHILD_ULIMIT=1126400
;;
esac
make_hadoop_dirs `ls -d /mnt*`
# Create tmp directory
mkdir /mnt/tmp
chmod a+rwxt /mnt/tmp
}
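# Write the Hadoop site files (hdfs-site.xml, core-site.xml, mapred-site.xml),
# the fair scheduler allocation file and the metrics properties, then relocate
# the PID and log directories off the root partition.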
function create_hadoop_conf {
##############################################################################
# Modify this section to customize your Hadoop cluster.
##############################################################################
cat > /etc/$HADOOP/conf.dist/hdfs-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>dfs.block.size</name>
<value>134217728</value>
<final>true</final>
</property>
<property>
<name>dfs.data.dir</name>
<value>$DFS_DATA_DIR</value>
<final>true</final>
</property>
<property>
<name>dfs.datanode.du.reserved</name>
<value>1073741824</value>
<final>true</final>
</property>
<property>
<name>dfs.datanode.handler.count</name>
<value>3</value>
<final>true</final>
</property>
<!--property>
<name>dfs.hosts</name>
<value>/etc/$HADOOP/conf.dist/dfs.hosts</value>
<final>true</final>
</property-->
<!--property>
<name>dfs.hosts.exclude</name>
<value>/etc/$HADOOP/conf.dist/dfs.hosts.exclude</value>
<final>true</final>
</property-->
<property>
<name>dfs.name.dir</name>
<value>$DFS_NAME_DIR</value>
<final>true</final>
</property>
<property>
<name>dfs.namenode.handler.count</name>
<value>5</value>
<final>true</final>
</property>
<property>
<name>dfs.permissions</name>
<value>true</value>
<final>true</final>
</property>
<property>
<name>dfs.replication</name>
<value>$DFS_REPLICATION</value>
</property>
<!-- Start Cloudera Desktop -->
<property>
<name>dfs.namenode.plugins</name>
<value>org.apache.hadoop.thriftfs.NamenodePlugin</value>
<description>Comma-separated list of namenode plug-ins to be activated.
</description>
</property>
<property>
<name>dfs.datanode.plugins</name>
<value>org.apache.hadoop.thriftfs.DatanodePlugin</value>
<description>Comma-separated list of datanode plug-ins to be activated.
</description>
</property>
<!-- End Cloudera Desktop -->
</configuration>
EOF
cat > /etc/$HADOOP/conf.dist/core-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.checkpoint.dir</name>
<value>$FS_CHECKPOINT_DIR</value>
<final>true</final>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://$MASTER_HOST:8020/</value>
</property>
<property>
<name>fs.trash.interval</name>
<value>1440</value>
<final>true</final>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/mnt/tmp/hadoop-\${user.name}</value>
<final>true</final>
</property>
<property>
<name>io.file.buffer.size</name>
<value>65536</value>
</property>
<property>
<name>hadoop.rpc.socket.factory.class.default</name>
<value>org.apache.hadoop.net.StandardSocketFactory</value>
<final>true</final>
</property>
<property>
<name>hadoop.rpc.socket.factory.class.ClientProtocol</name>
<value></value>
<final>true</final>
</property>
<property>
<name>hadoop.rpc.socket.factory.class.JobSubmissionProtocol</name>
<value></value>
<final>true</final>
</property>
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec</value>
</property>
<property>
<name>fs.s3.awsAccessKeyId</name>
<value>$AWS_ACCESS_KEY_ID</value>
</property>
<property>
<name>fs.s3.awsSecretAccessKey</name>
<value>$AWS_SECRET_ACCESS_KEY</value>
</property>
<property>
<name>fs.s3n.awsAccessKeyId</name>
<value>$AWS_ACCESS_KEY_ID</value>
</property>
<property>
<name>fs.s3n.awsSecretAccessKey</name>
<value>$AWS_SECRET_ACCESS_KEY</value>
</property>
</configuration>
EOF
cat > /etc/$HADOOP/conf.dist/mapred-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapred.child.java.opts</name>
<value>$CHILD_OPTS</value>
</property>
<property>
<name>mapred.child.ulimit</name>
<value>$CHILD_ULIMIT</value>
<final>true</final>
</property>
<property>
<name>mapred.job.tracker</name>
<value>$MASTER_HOST:8021</value>
</property>
<property>
<name>mapred.job.tracker.handler.count</name>
<value>5</value>
<final>true</final>
</property>
<property>
<name>mapred.local.dir</name>
<value>$MAPRED_LOCAL_DIR</value>
<final>true</final>
</property>
<property>
<name>mapred.map.tasks.speculative.execution</name>
<value>true</value>
</property>
<property>
<name>mapred.reduce.parallel.copies</name>
<value>10</value>
</property>
<property>
<name>mapred.reduce.tasks</name>
<value>$CLUSTER_REDUCE_TASKS</value>
</property>
<property>
<name>mapred.reduce.tasks.speculative.execution</name>
<value>false</value>
</property>
<property>
<name>mapred.submit.replication</name>
<value>10</value>
</property>
<property>
<name>mapred.system.dir</name>
<value>/hadoop/system/mapred</value>
</property>
<property>
<name>mapred.tasktracker.map.tasks.maximum</name>
<value>$MAX_MAP_TASKS</value>
<final>true</final>
</property>
<property>
<name>mapred.tasktracker.reduce.tasks.maximum</name>
<value>$MAX_REDUCE_TASKS</value>
<final>true</final>
</property>
<property>
<name>tasktracker.http.threads</name>
<value>46</value>
<final>true</final>
</property>
<property>
<name>mapred.jobtracker.taskScheduler</name>
<value>org.apache.hadoop.mapred.FairScheduler</value>
</property>
<property>
<name>mapred.fairscheduler.allocation.file</name>
<value>/etc/$HADOOP/conf.dist/fairscheduler.xml</value>
</property>
<property>
<name>mapred.compress.map.output</name>
<value>true</value>
</property>
<property>
<name>mapred.output.compression.type</name>
<value>BLOCK</value>
</property>
<!-- Start Cloudera Desktop -->
<property>
<name>mapred.jobtracker.plugins</name>
<value>org.apache.hadoop.thriftfs.ThriftJobTrackerPlugin</value>
<description>Comma-separated list of jobtracker plug-ins to be activated.
</description>
</property>
<!-- End Cloudera Desktop -->
</configuration>
EOF
cat > /etc/$HADOOP/conf.dist/fairscheduler.xml <<EOF
<?xml version="1.0"?>
<allocations>
</allocations>
EOF
cat > /etc/$HADOOP/conf.dist/hadoop-metrics.properties <<EOF
# Exposes /metrics URL endpoint for metrics information.
dfs.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
mapred.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
jvm.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
rpc.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
EOF
# Keep PID files in a non-temporary directory
sed -i -e "s|# export HADOOP_PID_DIR=.*|export HADOOP_PID_DIR=/var/run/hadoop|" \
/etc/$HADOOP/conf.dist/hadoop-env.sh
mkdir -p /var/run/hadoop
ln -nfsT /var/run/hadoop /var/run/hadoop-0.20
chown -R hadoop:hadoop /var/run/hadoop
# Set SSH options within the cluster
sed -i -e 's|# export HADOOP_SSH_OPTS=.*|export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no"|' \
/etc/$HADOOP/conf.dist/hadoop-env.sh
# Hadoop logs should be on the /mnt partition
rm -rf /var/log/hadoop /var/log/hadoop-0.20
mkdir /mnt/hadoop/logs
ln -nfsT /mnt/hadoop/logs /var/log/hadoop
ln -nfsT /mnt/hadoop/logs /var/log/hadoop-0.20
chown -R hadoop:hadoop /var/log/hadoop /var/log/hadoop-0.20 /mnt/hadoop/logs
}
# Sets up a small status website on the master node.
# TODO(philip): Add links/documentation.
function setup_web() {
if which dpkg &> /dev/null; then
apt-get -y install thttpd
WWW_BASE=/var/www
elif which rpm &> /dev/null; then
yum install -y thttpd
chkconfig --add thttpd
WWW_BASE=/var/www/thttpd/html
fi
cat > $WWW_BASE/index.html << END
<html>
<head>
<title>Hadoop EC2 Cluster</title>
</head>
<body>
<h1>Hadoop EC2 Cluster</h1>
To browse the cluster you need to have a proxy configured.
Start the proxy with <tt>hadoop-ec2 proxy &lt;cluster_name&gt;</tt>,
and point your browser to
<a href="http://cloudera-public.s3.amazonaws.com/ec2/proxy.pac">this Proxy
Auto-Configuration (PAC)</a> file. To manage multiple proxy configurations,
you may wish to use
<a href="https://addons.mozilla.org/en-US/firefox/addon/2464">FoxyProxy</a>.
<ul>
<li><a href="http://$MASTER_HOST:50070/">NameNode</a>
<li><a href="http://$MASTER_HOST:50030/">JobTracker</a>
<li><a href="http://$MASTER_HOST:8088/">Cloudera Desktop</a>
</ul>
</body>
</html>
END
service thttpd start
}
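# Point a DynDNS hostname at this instance, if DYNDNS_USER/DYNDNS_PASS/DYNDNS_HOST are set.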
function update_dyndns_address() {
if [ "$DYNDNS_PASS" != "" ] ; then
curl "http://${DYNDNS_USER}:${DYNDNS_PASS}@members.dyndns.org/nic/update?hostname=${DYNDNS_HOST}"
fi
}
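# Install and start the master daemons (namenode, secondary namenode, jobtracker,
# and optionally datanode/tasktracker), format HDFS on first boot, and create the
# shared HDFS directories.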
function start_hadoop_master() {
if which dpkg &> /dev/null; then
AS_HADOOP="su -s /bin/bash - hadoop -c"
# Format HDFS
[ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP namenode -format"
apt-get -y install $HADOOP-namenode
apt-get -y install $HADOOP-secondarynamenode
apt-get -y install $HADOOP-jobtracker
apt-get -y install $HADOOP-datanode
apt-get -y install $HADOOP-tasktracker
elif which rpm &> /dev/null; then
AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c"
# Format HDFS
[ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP namenode -format"
chkconfig --add $HADOOP-namenode
chkconfig --add $HADOOP-secondarynamenode
chkconfig --add $HADOOP-jobtracker
yum install -y $HADOOP-datanode
yum install -y $HADOOP-tasktracker
chkconfig --add $HADOOP-datanode
chkconfig --add $HADOOP-tasktracker
fi
# Note: use 'service' rather than the start-all.sh etc. scripts
service $HADOOP-namenode start
service $HADOOP-secondarynamenode start
service $HADOOP-jobtracker start
if [ "$MASTER_IS_DATANODE" == "y" ] ; then service $HADOOP-datanode start ; fi
if [ "$MASTER_IS_TASKTRACKER" == "y" ] ; then service $HADOOP-tasktracker start ; fi
$AS_HADOOP "$HADOOP dfsadmin -safemode wait"
$AS_HADOOP "/usr/bin/$HADOOP fs -mkdir /user"
# The following is questionable, as it allows any user to delete another user's directory.
# It's needed to allow users to create their own home directories
$AS_HADOOP "/usr/bin/$HADOOP fs -chmod +w /user"
# Create temporary directory for Pig and Hive in HDFS
$AS_HADOOP "/usr/bin/$HADOOP fs -mkdir /tmp"
$AS_HADOOP "/usr/bin/$HADOOP fs -chmod +w /tmp"
$AS_HADOOP "/usr/bin/$HADOOP fs -mkdir /user/hive/warehouse"
$AS_HADOOP "/usr/bin/$HADOOP fs -chmod +w /user/hive/warehouse"
# Update dyndns for master node
update_dyndns_address
}
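# Install and start the worker daemons (datanode and tasktracker).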
function start_hadoop_slave() {
if which dpkg &> /dev/null; then
apt-get -y install $HADOOP-datanode
apt-get -y install $HADOOP-tasktracker
elif which rpm &> /dev/null; then
yum install -y $HADOOP-datanode
yum install -y $HADOOP-tasktracker
chkconfig --add $HADOOP-datanode
chkconfig --add $HADOOP-tasktracker
fi
service $HADOOP-datanode start
service $HADOOP-tasktracker start
}
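# Install Cloudera Desktop: the full package plus plugins on the master,
# plugins only on slaves.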
function install_cloudera_desktop {
if which dpkg &> /dev/null; then
if $IS_MASTER; then
apt-get -y install libxslt1.1 cloudera-desktop cloudera-desktop-plugins
dpkg -i /tmp/cloudera-desktop.deb /tmp/cloudera-desktop-plugins.deb
else
apt-get -y install cloudera-desktop-plugins
dpkg -i /tmp/cloudera-desktop-plugins.deb
fi
elif which rpm &> /dev/null; then
if $IS_MASTER; then
yum install -y python-devel cloudera-desktop cloudera-desktop-plugins
else
yum install -y cloudera-desktop-plugins
fi
fi
}
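# Point Cloudera Desktop's HDFS and MapReduce cluster definitions at the master.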
function configure_cloudera_desktop {
if $IS_MASTER; then
mv /usr/share/cloudera-desktop/conf/cloudera-desktop.ini /usr/share/cloudera-desktop/conf/cloudera-desktop.ini.orig
cat > /usr/share/cloudera-desktop/conf/cloudera-desktop.ini <<EOF
[hadoop]
[[hdfs_clusters]]
[[[default]]]
namenode_host=$MASTER_HOST
[[mapred_clusters]]
[[[default]]]
jobtracker_host=$MASTER_HOST
EOF
fi
}
function start_cloudera_desktop {
/etc/init.d/cloudera-desktop start
}
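# Install the NFS server (master only) and client packages.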
function install_nfs {
if which dpkg &> /dev/null; then
if $IS_MASTER; then
apt-get -y install nfs-kernel-server
fi
apt-get -y install nfs-common
elif which rpm &> /dev/null; then
echo "!!!! Don't know how to install nfs on RPM yet !!!!"
# if $IS_MASTER; then
# yum install -y
# fi
# yum install nfs-utils nfs-utils-lib portmap system-config-nfs
fi
}
# Sets up an NFS-shared home directory.
#
# The actual files live in /mnt/home on the master. You probably want /mnt/home to
# live on an EBS volume, with a line in ec2-storage-YOURCLUSTER.json like
# "master": [ [
# { "device": "/dev/sdh", "mount_point": "/mnt/home", "volume_id": "vol-01234567" }
# ....
# On slaves, home directories are NFS-mounted from the master at /mnt/home.
function configure_nfs {
if $IS_MASTER; then
grep -q '/mnt/home' /etc/exports || ( echo "/mnt/home *.internal(rw,no_root_squash,no_subtree_check)" >> /etc/exports )
else
# slaves get /mnt/home and /usr/global from master
grep -q '/mnt/home' /etc/fstab || ( echo "$MASTER_HOST:/mnt/home /mnt/home nfs rw 0 0" >> /etc/fstab )
fi
rmdir /home 2>/dev/null
mkdir -p /var/lib/nfs/rpc_pipefs
mkdir -p /mnt/home
ln -nfsT /mnt/home /home
}
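# Start NFS services; slaves mount the master's /mnt/home export.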
function start_nfs {
if $IS_MASTER; then
/etc/init.d/nfs-kernel-server restart
/etc/init.d/nfs-common restart
else
/etc/init.d/nfs-common restart
mount /mnt/home
fi
}
# Follow along with tail -f /var/log/user.log
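# Install general development tools (version control, editors, Ruby, Python,
# Java build tools) plus the gems and Python packages used for cluster jobs.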
function configure_devtools {
apt-get -y update ;
apt-get -y upgrade ;
#
apt-get -y install git-core cvs subversion exuberant-ctags tree zip openssl ;
apt-get -y install libpcre3-dev libbz2-dev libonig-dev libidn11-dev libxml2-dev libxslt1-dev libevent-dev;
apt-get -y install emacs emacs-goodies-el emacsen-common ;
apt-get -y install ruby rubygems ruby1.8-dev ruby-elisp irb ri rdoc python-setuptools python-dev;
# Distributed database
apt-get -y install libtokyocabinet-dev tokyocabinet-bin ;
# Java dev
apt-get -y install ant # TODO: ivy
# Python
easy_install simplejson boto ctypedbytes dumbo
# Un-screwup Ruby Gems
gem install --no-rdoc --no-ri rubygems-update --version=1.3.1 ; /var/lib/gems/1.8/bin/update_rubygems; gem update --no-rdoc --no-ri --system ; gem --version ;
GEM_COMMAND="gem install --no-rdoc --no-ri --source=http://gemcutter.org"
# Ruby gems: Basic utility and file format gems
$GEM_COMMAND extlib oniguruma fastercsv json libxml-ruby htmlentities addressable uuidtools
# Ruby gems: Wukong and friends
$GEM_COMMAND wukong monkeyshines edamame wuclan
#
# export CLASSPATH=$( echo `/bin/ls /usr/lib/pig/*.jar /usr/lib/hadoop/*.jar /usr/lib/hadoop/lib/*.jar` | ruby -e 'puts $stdin.read.chomp.gsub(/\s/, ":")' )
# ( cd /usr/lib/pig/contrib ;
# svn co http://svn.apache.org/repos/asf/hadoop/pig/trunk/contrib/piggybank ;
# cd piggybank/java ;
# ant )
}
#
# This is made of kludge. Among other things, you have to create the users in
# the right order -- and ensure none have been created before -- or their UIDs
# won't match the ones on the EBS volume.
#
# This also creates and sets permissions on the HDFS home directories, which
# might be best left out. (It depends on HDFS coming up in time.)
#
function make_user_accounts {
for newuser in $USER_ACCOUNTS ; do
adduser $newuser --disabled-password --gecos "";
sudo -u hadoop hadoop dfs -mkdir /user/$newuser
sudo -u hadoop hadoop dfs -chown $newuser /user/$newuser
done
}
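# Remove package caches and refresh the locate database.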
function cleanup {
apt-get -y autoremove
apt-get -y clean
updatedb
}
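################################################################################
# Main: run the setup steps, then start the appropriate daemons for this node
################################################################################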
install_nfs
configure_nfs
register_auto_shutdown
update_repo
install_user_packages
install_hadoop
install_cloudera_desktop
configure_hadoop
configure_cloudera_desktop
start_nfs
configure_devtools
if $IS_MASTER ; then
setup_web
start_hadoop_master
start_cloudera_desktop
else
start_hadoop_slave
fi
make_user_accounts
cleanup