Friday, June 7, 2013

Try Out Hadoop Streaming with Shell awk

-- Remove the whole output directory from HDFS before re-running the job
# "hadoop fs -rmr" is deprecated; "-rm -r" is the supported spelling.
hadoop fs -rm -r /user/*****/tempoutput


-- Run the script in the allocated cluster. The results (output files) are written to the dfs directory tempoutput.

# Map-only streaming job: the quoted awk one-liner keeps words longer
# than 50 characters; -reducer NONE skips the reduce phase entirely.
hadoop jar $HADOOP/hadoop-streaming.jar \
  -Dmapred.job.queue.name=unfunded \
  -mapper "awk '{if(length(\$0) > 50){print \$0}}'" \
  -reducer NONE \
  -input linux.words \
  -output tempoutput



-- Wrap with shell script: mapper

$ cat mymapper1.sh

#!/bin/sh
# Pass through only input lines longer than 50 characters.
# Pattern-only awk: a true pattern's default action is to print the line.
awk 'length($0) > 50'
# Map-only streaming job using the wrapper script as the mapper.
# -file ships mymapper1.sh to the cluster nodes alongside the job;
# -reducer NONE writes the mapper output straight to tempoutput.
yarn jar $HADOOP/hadoop-streaming.jar \
-Dmapred.job.queue.name=unfunded \
-mapper mymapper1.sh \
-reducer NONE \
-input linux.words \
-output tempoutput \
-file mymapper1.sh

-- Wrap with shell script: mapper & reducer
$ cat mymapper.sh
#!/bin/sh
# Emit (prefix, word) pairs: for each sufficiently long input word, print
# its 4-, 5-, and 6-character prefixes as keys followed by the full word.
# NOTE(review): original used "gt" (not a valid awk operator) and
# substr(..., 0, n) — awk strings are 1-indexed, so a start of 0 silently
# yields only n-1 characters; both fixed below.
awk '{
  if (length($0) > 4) { print substr($0, 1, 4)" "$0 }
  if (length($0) > 5) { print substr($0, 1, 5)" "$0 }
  if (length($0) > 6) { print substr($0, 1, 6)" "$0 }
}'

$ cat myreducer.sh
#!/bin/sh
# Group the key-sorted (key, value) stream by key and print "key v1,v2,..."
# for keys whose duplicate count stays under 3.  Input must arrive sorted
# by key, which the Hadoop shuffle/sort phase guarantees.
# NOTE(review): the original was not valid awk ("lt" is not an operator,
# braces were unbalanced) and never flushed the final group; the evident
# intent is reconstructed below, with mylist seeded from the value (the
# original's "mylist=curkey" looks like a typo) — verify against expected
# job output.
awk '{
  curkey = $1
  curvalue = $2
  if (prevkey == curkey) {
    count += 1
    mylist = mylist "," curvalue
  } else {
    if (prevkey != "" && count < 3) { print prevkey " " mylist }
    count = 0
    mylist = curvalue
    prevkey = curkey
  }
}
END {
  if (prevkey != "" && count < 3) { print prevkey " " mylist }
}'

# Full map/reduce streaming job: both wrapper scripts are shipped to the
# cluster nodes via -file and run as the mapper and reducer stages.
# (Queue name is redacted in this post.)
hadoop jar $HADOOP/hadoop-streaming.jar \
-Dmapred.job.queue.name=*********** \
-input linux.words \
-output tempoutput2 \
-mapper mymapper.sh \
-reducer myreducer.sh \
-file mymapper.sh \
-file myreducer.sh
 

No comments:

Post a Comment