..
Pig Implementation
TODO
- Complex Datatypes
- UDF
- Pig Storage
- Piggy bank
- New Functions
- Pig Scripts
- Innovative Queries
Basic Queries
Loading Data
tips_json_parsing = LOAD '/user/hduser/yelp/tip.json' USING JsonLoader('user_id:chararray, business_id:chararray, text: chararray, data: chararray, compliment_count:int');
Number of comments by each user
group_user = GROUP tips_json_parsing BY user_id;
tmp = FOREACH group_user GENERATE group, COUNT(tips_json_parsing);
dump tmp;
User with the most comments
tmp2 = ORDER tmp by $1
User with most compliments
group_user = GROUP tips_json_parsing BY user_id;
tmp = FOREACH group_user GENERATE (tips_json_parsing.compliment_count, SUM(tips_json_parsing.compliment_count));
tmp2 = ORDER tmp BY $0;
dump tmp2;
Restaurant with the most compliments
group_user = GROUP tips_json_parsing BY business_id;
tmp = GROUP tips_json_parsing BY business_id;
tmp2 = FOREACH tmp GENERATE group, COUNT(tips_json_parsing.compliment_count);
tmp2 = ORDER tmp2 by $1;
dump tmp2
Complex Queries
Loading the dataset
business_json = LOAD '/user/hduser/yelp/business.json' USING JsonLoader('business_id:chararray, name:chararray, address:chararray, city:chararray, state:chararray, postal_code:chararray, latitude:float, longitude:float, stars:float, review_count:int, is_open:int');
Correlation Coefficient of review_count and stars
review_and_stars = FOREACH business_json GENERATE (double)stars, (double)review_count;
rel = GROUP review_and_stars BY ALL;
corop = FOREACH rel GENERATE COR(review_and_stars.stars, review_and_stars.review_count);
Which year produced the most number of elite users
Loading the dataset
user_json = LOAD '/user/hduser/yelp/user.json' USING JsonLoader('user_id:chararray, name:chararray, review_count:int, yelping_since:chararray, useful:int, funny:int, cool:int, elite:chararray, friends:chararray, fans:int, average_stars:float, compliment_hot:int, compliment_more:int, compliment_profile:int, compliment_cute:int, compliment_list:int, compliment_note:int, compliment_plain:int, compliment_cool:int, compliment_funny:int, compliment_writer:int, compliment_photos:int');
year_alone = FOREACH user_json GENERATE (int)SUBSTRING(yelping_since, 0, 4);
nr_elite_years = FOREACH user_json GENERATE SIZE(STRSPLIT(elite, ','));
year_and_nr_elite = FOREACH user_json GENERATE (int)SUBSTRING(yelping_since, 0, 4), SIZE(STRSPLIT(elite, ','));
grouped_data = GROUP year_and_nr_elite BY year;
result = foreach grouped_data GENERATE group, SUM(year_and_nr_elite.nr_elite);
sorted_result = ORDER result by $0;
Most popular user
user_and_nr_friends = FOREACH user_json GENERATE name, SIZE(STRSPLIT(friends, ','));
result = order user_and_nr_friends BY $1;
Most visited restaurant
Loading the dataset
checkin_json = LOAD '/user/hduser/yelp/checkin.json' USING JsonLoader('business_id:chararray, date:chararray');
nr_visited = FOREACH checkin_json GENERATE business_id, SIZE(STRSPLIT(date, ','));
business_id_and_name = FOREACH business_json generate business_id, name;
result = join most_visited by business_id, business_id_and_name by business_id;
result = order result by $1;
UDF
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
public class GroupDatesByYear extends EvalFunc<String> {
public String convertMapToString(Map<String, Integer> map) {
StringBuilder mapAsString = new StringBuilder("{");
for (String key : map.keySet()) {
mapAsString.append(key + "=" + map.get(key).toString() + ", ");
}
mapAsString.delete(mapAsString.length() - 2, mapAsString.length()).append("}");
return mapAsString.toString();
}
public String exec(Tuple input) throws IOException {
if (input == null || input.size() == 0)
return null;
String str = (String) input.get(0);
String[] timestamps = str.split(",");
Map<String, Integer> hm = new HashMap<String, Integer>();
for (String timestamp : timestamps) {
String year = timestamp.strip().split(" ")[0].split("-")[0];
Integer tmp = hm.get(year);
if (tmp == null) {
hm.put(year, 1);
} else {
hm.put(year, tmp + 1);
}
}
return convertMapToString(hm);
}
}
REGISTER './udf.jar';
checkin_json = LOAD '/user/hduser/yelp/checkin.json' USING JsonLoader('business_id:chararray, date:chararray');
result = FOREACH checkin_json GENERATE GroupDatesByYear(date);
illustrate;