Wednesday, May 29, 2013

Transform tab-separated data to Vowpal Wabbit (VW) input format

import sys
import string

def to_vw_line(line, first_categorical=2):
    """Convert one tab-separated record into a VW-style example line.

    The first field is written before the `` | `` separator.  Fields with
    index 1 up to (but excluding) ``first_categorical`` are written as
    ``<index>:<value>``; fields from ``first_categorical`` onward are
    written as ``<index>cell<value>:1``.  Empty fields are skipped.  Every
    emitted feature is followed by a tab, so a line that has at least one
    feature ends with a trailing tab.

    Args:
        line: One input record; fields are separated by tabs.
        first_categorical: Index of the first field treated as categorical.
            The default of 2 means only field 1 is continuous.

    Returns:
        The converted line, without a trailing newline.
    """
    # NOTE(review): strip() also removes a leading tab, so a record whose
    # first field is empty has its columns shifted left by one -- confirm
    # that every input record carries a non-empty first field.
    toks = [tok.strip() for tok in line.strip().split('\t')]
    parts = [toks[0], " | "]
    # Iterating over the fields actually present (rather than a fixed
    # range) avoids an IndexError on records that have only the first field.
    for i, tok in enumerate(toks[1:], 1):
        if not tok:
            continue
        if i < first_categorical:
            parts.append(str(i) + ":" + tok + "\t")
        else:
            parts.append(str(i) + "cell" + tok + ":1" + "\t")
    return "".join(parts)


def main(stream=None):
    """Read records from *stream* (stdin by default) and print VW lines."""
    if stream is None:
        stream = sys.stdin
    for line in stream:
        if not line.strip():
            # A blank line carries neither a first field nor features.
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(to_vw_line(line))


if __name__ == "__main__":
    main()
 
# Run csv2vm2.py as the mapper of a Hadoop Streaming job.
#   $1 - input path, passed to -input
#   $2 - output path, passed to -output
# The paths are quoted so the local shell neither word-splits nor
# glob-expands them before hadoop sees them.
# NOTE(review): "-reducer NONE" requests no reducer, so the
# mapred.reduce.tasks=5 setting looks redundant -- confirm.
# NOTE(review): "***" appears to be a redacted queue name; substitute the
# real queue before running.
hadoop  jar hadoop-streaming.jar \
-input "$1" \
-output "$2" \
-mapper "python csv2vm2.py" \
-reducer NONE \
-file csv2vm2.py \
-jobconf mapred.reduce.tasks=5 \
-jobconf mapred.job.queue.name=***;

# Convert the training and test sets.  The globs are quoted so the local
# shell passes them through literally instead of expanding them against
# the local filesystem (presumably these are HDFS paths -- confirm).
./csv2vm.sh '/train1/*' /vwtrain1/
./csv2vm.sh '/test1/*' /vwtest1/



No comments:

Post a Comment