--
- Remove the whole directory
hadoop fs -rm -r /user/*****/tempoutput
-- Run the script in the allocated cluster.
The results (output files) are written to the dfs directory tempoutput.
# Run the streaming job with an inline awk mapper (note the escaped \$0 so the
# local shell does not expand it) and no reduce phase (-reducer NONE).
hadoop jar $HADOOP/hadoop-streaming.jar \
  -Dmapred.job.queue.name=unfunded \
  -mapper "awk '{if(length(\$0) > 50){print \$0}}'" \
  -reducer NONE \
  -input linux.words \
  -output tempoutput
-- Wrap with shell script: mapper
$ cat mymapper1.sh
#!/bin/sh
# Mapper: pass through only input lines longer than 50 characters.
# awk reads the records Hadoop Streaming feeds the task on stdin.
awk '{if(length($0) > 50){print $0}}'
# Submit the job with the mapper wrapped in a shell script; -file ships
# mymapper1.sh to the task nodes alongside the job.
yarn jar $HADOOP/hadoop-streaming.jar \
  -Dmapred.job.queue.name=unfunded \
  -mapper mymapper1.sh \
  -reducer NONE \
  -input linux.words \
  -output tempoutput \
  -file mymapper1.sh
-- Wrap with shell script: mapper & reducer
$ cat mymapper.sh
#!/bin/sh
# Mapper: for each word on stdin, emit "prefix word" pairs for every
# prefix of length 4, 5 and 6 that the word is strictly longer than.
# Fixes vs. the posted version:
#  - awk has no "gt" operator; "length($0) gt 4" parses as string
#    concatenation (always truthy), so the comparison must be ">".
#  - awk strings are 1-indexed; substr($0, 0, n) yields only n-1
#    characters, so the start position must be 1.
awk '{
  if (length($0) > 4) { print substr($0, 1, 4)" "$0 }
  if (length($0) > 5) { print substr($0, 1, 5)" "$0 }
  if (length($0) > 6) { print substr($0, 1, 6)" "$0 }
}'
$ cat myreducer.sh
#!/bin/sh
# Reducer: input arrives sorted by key ("prefix word"). Accumulate the
# comma-joined values for each key and, when the key changes, print the
# previous key and its list — but only for groups seen fewer than 4 times
# (count holds occurrences-1, since it resets to 0 on each new key).
# Fixes vs. the posted version:
#  - awk has no "lt" operator; the comparison must be "<".
#  - the print string was split mid-literal by the blog's line wrapping.
#  - NOTE(review): original reset was mylist=curkey, inconsistent with the
#    value list built elsewhere — presumably a typo for curvalue; confirm.
#  - an END block is added so the final group is not silently dropped.
awk '{
  curkey = $1;
  curvalue = $2;
  if (prevkey == curkey) {
    count += 1;
    mylist = mylist "," curvalue;
  } else {
    if (count < 3) { print prevkey" "mylist; }
    count = 0;
    mylist = curvalue;
  }
  prevkey = curkey;
}
END {
  if (count < 3) { print prevkey" "mylist; }
}'
# Submit the job with both mapper and reducer wrapped in shell scripts;
# each -file option ships the named script to the task nodes.
hadoop jar $HADOOP/hadoop-streaming.jar \
  -Dmapred.job.queue.name=*********** \
  -input linux.words \
  -output tempoutput2 \
  -mapper mymapper.sh \
  -reducer myreducer.sh \
  -file mymapper.sh \
  -file myreducer.sh
No comments:
Post a Comment