def adimp1dtoimp1dbucket(adimpfreq1d):
    """Map the frequency value ``adimpfreq1d`` to a bucket id.

    Bucket mapping:
        0   when the value equals 0
        1   when 1 <= value <= 5
        2   when 6 <= value <= 20
        3   when value >= 21
        -1  for anything else (negative values, or non-integers that
            fall between two ranges such as 5.5)

    Returns:
        The integer bucket id.
    """
    if adimpfreq1d == 0:
        return 0
    if 1 <= adimpfreq1d <= 5:
        return 1
    if 6 <= adimpfreq1d <= 20:
        return 2
    if adimpfreq1d >= 21:
        return 3
    # No range matched: signal an unclassifiable input.
    return -1
@outputSchema("adimp3wbucket:int")
def adimp3wtoimp3wbucket(adimpfreq3w):
    """Map the frequency value ``adimpfreq3w`` to a bucket id.

    Bucket mapping:
        0   when the value equals 0
        1   when 1 <= value <= 12
        2   when 13 <= value <= 50
        3   when value > 50
        -1  for anything else (negative values, or non-integers that
            fall between two of the lower ranges such as 12.5)

    Returns:
        The integer bucket id.
    """
    if adimpfreq3w == 0:
        return 0
    if 1 <= adimpfreq3w <= 12:
        return 1
    if 13 <= adimpfreq3w <= 50:
        return 2
    # 50 itself is already claimed by bucket 2 above, so the top bucket
    # starts strictly after 50; spelling it as "> 50" removes the
    # overlapping ">= 50" bound without changing any result.
    if adimpfreq3w > 50:
        return 3
    # No range matched: signal an unclassifiable input.
    return -1
--Hello world data sample.
$ cat hello_world.txt
foo
foo
baz
--Sample Pig using Jython.
$ cat hello_world.pig
-- Register the functions defined in hello.py as Jython UDFs under the namespace "hello".
Register 'hello.py' using jython as hello;
-- Load hello_world.txt as a relation with a single chararray field called name.
A = load 'hello_world.txt' as (name: chararray);
-- For every row, emit the name together with the result of the helloworld() UDF.
b = foreach A generate name,hello.helloworld();
-- Write the result to the output directory (the path and the storage function are redacted in this listing).
store b into '/user/*****/hello_world_output' using P****e();
--Python Hello world File
$ cat hello.py
#!/usr/bin/python
@outputSchema("word:chararray")
def helloworld():
    """Return the constant greeting string ``'Hello, World'``."""
    return 'Hello, World'
--Copy data file
hadoop fs -put hello_world.txt
--Run Pig Scripts
ls -la
pig -****job.queue.name=***** hello_world.pig
https://www.codementor.io/data-science/tutorial/extending-hadoop-apache-pig-with-python-udfs
1 Downloaded pig_util.py from https://svn.apache.org/repos/asf/pig/trunk/src/python/streaming/pig_util.py
2 Created myudf.py file:
from pig_util import outputSchema
@outputSchema("word:chararray")
def hi_world(self):
    """Return the constant string ``"hello word"``.

    Args:
        self: Accepted but never read by the function body.

    Returns:
        The literal string ``"hello word"``.
    """
    return "hello word"
3 Created udf.pig
3 Created udf.pig
-- Register the Python UDF file; the alias must match the prefix used in the FOREACH below.
REGISTER 'myudf.py' using python as my_special_udfs;
-- Load the input as a relation with a single chararray field called name.
users = LOAD '/kohls/stage/tmp/***/pyudf/user_date' AS (name: chararray);
-- For every row, emit the name together with the result of the hi_world() UDF.
hello_users = FOREACH users GENERATE name, my_special_udfs.hi_world();
-- Print the resulting relation to the console.
dump hello_users;
4 Here's user_data
sunil
kumar
avril
lavigne
5 Ran the pig job using: pig -x local udf.pig
Faced one problem, which is passing 'self' parameter to python function. That is not mentioned in tutorial.
5 Ran the pig job using: pig -x local udf.pig
Faced one problem, which is passing 'self' parameter to python function. That is not mentioned in tutorial.
No comments:
Post a Comment