import string
import sys
def convert_line(line, categorical_start=2):
    """Convert one tab-separated record into a ``label | features`` line.

    Column 0 is the label.  Columns ``1 .. categorical_start - 1`` are
    emitted as continuous features (``<index>:<value>``); columns from
    ``categorical_start`` onward are emitted as indicator features
    (``<index>cell<value>:1``).  Fields that are empty after stripping
    whitespace are skipped.  Every emitted feature is followed by a tab,
    so a line with at least one feature ends in a trailing tab.

    The output layout looks like Vowpal Wabbit's text input format
    (presumably the intent, given the "vw" output paths used with this
    script -- confirm against the consumer).

    Args:
        line: One raw input record, with or without its line terminator.
        categorical_start: Index of the first categorical column.  The
            default of 2 means column 1 is the only continuous feature.

    Returns:
        The converted line, without a trailing newline.
    """
    # Remove only the line terminator: a full strip() would also eat a
    # leading tab (empty label) and shift every column index by one.
    toks = line.rstrip("\r\n").split("\t")

    parts = [toks[0].strip(), " | "]
    # Iterating over the fields that actually exist avoids an IndexError on
    # records that contain nothing but a label.
    for i, raw in enumerate(toks[1:], 1):
        value = raw.strip()
        if not value:
            continue
        if i < categorical_start:
            parts.append("%d:%s\t" % (i, value))
        else:
            parts.append("%dcell%s:1\t" % (i, value))
    return "".join(parts)


def main():
    """Read records from stdin and write one converted line per record."""
    for line in sys.stdin:
        # Whitespace-only lines carry neither a label nor features.
        if not line.strip():
            continue
        # sys.stdout.write behaves the same under Python 2 and Python 3.
        sys.stdout.write(convert_line(line) + "\n")


if __name__ == "__main__":
    main()
# Run csv2vm2.py as the mapper of a Hadoop Streaming job.
# Usage: csv2vm.sh <input path or glob> <output directory>
#   $1  value passed to -input
#   $2  value passed to -output
# Both arguments are quoted so that paths containing spaces or glob
# characters reach Hadoop unchanged instead of being split or re-expanded
# by this shell.
# NOTE(review): "***" is a redacted queue name -- substitute the real queue.
# NOTE(review): -reducer NONE asks for no reduce phase, so the
# mapred.reduce.tasks=5 setting looks redundant -- confirm before removing.
hadoop jar hadoop-streaming.jar \
    -input "$1" \
    -output "$2" \
    -mapper "python csv2vm2.py" \
    -reducer NONE \
    -file csv2vm2.py \
    -jobconf mapred.reduce.tasks=5 \
    -jobconf mapred.job.queue.name=***
# Convert the training set and the test set.
# The input globs are quoted so the local shell passes the pattern through
# literally; unquoted, a matching local path would be expanded into several
# arguments and shift the output directory out of $2.
# NOTE(review): assumes these are paths on the cluster filesystem that Hadoop
# expands itself -- confirm.
./csv2vm.sh '/train1/*' /vwtrain1/
./csv2vm.sh '/test1/*' /vwtest1/
No comments:
Post a Comment