Friday, June 7, 2013

Try Out Hadoop Streaming with Shell awk

-- Remove the whole output directory from HDFS before re-running the job
# "hadoop fs -rmr" is deprecated; "-rm -r" is the supported spelling.
hadoop fs -rm -r /user/*****/tempoutput


-- Run the script in the allocated cluster. The results (output files) are written to the dfs directory tempoutput.

# Map-only streaming job: the quoted awk one-liner keeps words longer
# than 50 characters; -reducer NONE skips the reduce phase entirely.
hadoop jar $HADOOP/hadoop-streaming.jar \
  -Dmapred.job.queue.name=unfunded \
  -mapper "awk '{if(length(\$0) > 50){print \$0}}'" \
  -reducer NONE \
  -input linux.words \
  -output tempoutput



-- Wrap with shell script: mapper

$ cat mymapper1.sh

#!/bin/sh
# Pass through only input lines longer than 50 characters.
# Pattern-only awk: a true pattern's default action is to print the line.
awk 'length($0) > 50'
# Map-only streaming job using the wrapper script as the mapper.
# -file ships mymapper1.sh to the cluster nodes alongside the job;
# -reducer NONE writes the mapper output straight to tempoutput.
yarn jar $HADOOP/hadoop-streaming.jar \
-Dmapred.job.queue.name=unfunded \
-mapper mymapper1.sh \
-reducer NONE \
-input linux.words \
-output tempoutput \
-file mymapper1.sh

-- Wrap with shell script: mapper & reducer
$ cat mymapper.sh
#!/bin/sh
# Emit (prefix, word) pairs: for each sufficiently long input word, print
# its 4-, 5-, and 6-character prefixes as keys followed by the full word.
# NOTE(review): original used "gt" (not a valid awk operator) and
# substr(..., 0, n) — awk strings are 1-indexed, so a start of 0 silently
# yields only n-1 characters; both fixed below.
awk '{
  if (length($0) > 4) { print substr($0, 1, 4)" "$0 }
  if (length($0) > 5) { print substr($0, 1, 5)" "$0 }
  if (length($0) > 6) { print substr($0, 1, 6)" "$0 }
}'

$ cat myreducer.sh
#!/bin/sh
# Group the key-sorted (key, value) stream by key and print "key v1,v2,..."
# for keys whose duplicate count stays under 3.  Input must arrive sorted
# by key, which the Hadoop shuffle/sort phase guarantees.
# NOTE(review): the original was not valid awk ("lt" is not an operator,
# braces were unbalanced) and never flushed the final group; the evident
# intent is reconstructed below, with mylist seeded from the value (the
# original's "mylist=curkey" looks like a typo) — verify against expected
# job output.
awk '{
  curkey = $1
  curvalue = $2
  if (prevkey == curkey) {
    count += 1
    mylist = mylist "," curvalue
  } else {
    if (prevkey != "" && count < 3) { print prevkey " " mylist }
    count = 0
    mylist = curvalue
    prevkey = curkey
  }
}
END {
  if (prevkey != "" && count < 3) { print prevkey " " mylist }
}'

# Full map/reduce streaming job: both wrapper scripts are shipped to the
# cluster nodes via -file and run as the mapper and reducer stages.
# (Queue name is redacted in this post.)
hadoop jar $HADOOP/hadoop-streaming.jar \
-Dmapred.job.queue.name=*********** \
-input linux.words \
-output tempoutput2 \
-mapper mymapper.sh \
-reducer myreducer.sh \
-file mymapper.sh \
-file myreducer.sh
 

No comments:

Post a Comment