..

Pig Implementation

TODO

  • Complex Datatypes
  • UDF
  • Pig Storage
  • Piggy bank
  • New Functions
  • Pig Scripts
  • Innovative Queries

Basic Queries

Loading Data

tips_json_parsing = LOAD '/user/hduser/yelp/tip.json' USING JsonLoader('user_id:chararray, business_id:chararray, text: chararray, data: chararray, compliment_count:int');

Number of comments by each user

group_user = GROUP tips_json_parsing  BY user_id;
tmp = FOREACH group_user GENERATE group, COUNT(tips_json_parsing);
dump tmp;

Pasted image 20231031144013

User with the most comments

tmp2 = ORDER tmp by $1

Pasted image 20231031144252

User with most compliments

group_user = GROUP tips_json_parsing  BY user_id;
tmp = FOREACH group_user GENERATE (tips_json_parsing.compliment_count, SUM(tips_json_parsing.compliment_count));
tmp2 = ORDER tmp BY $0;
dump tmp2;

Pasted image 20231031145139

Restaurant with the most compliments

group_user = GROUP tips_json_parsing  BY business_id;
tmp = GROUP tips_json_parsing BY business_id;
tmp2 = FOREACH tmp GENERATE group, COUNT(tips_json_parsing.compliment_count);
tmp2 = ORDER tmp2 by $1;
dump tmp2

Pasted image 20231031150125

Complex Queries

Loading the dataset

business_json = LOAD  '/user/hduser/yelp/business.json' USING JsonLoader('business_id:chararray, name:chararray, address:chararray, city:chararray, state:chararray, postal_code:chararray, latitude:float, longitude:float, stars:float, review_count:int, is_open:int');

Pasted image 20231121101432

Correlation Coefficient of review_count and stars

review_and_stars = FOREACH business_json GENERATE (double)stars, (double)review_count;
rel = GROUP review_and_stars BY ALL;
corop = FOREACH rel GENERATE COR(review_and_stars.stars, review_and_stars.review_count);

Pasted image 20231121101319

Which year produced the most number of elite users

Loading the dataset

user_json = LOAD '/user/hduser/yelp/user.json' USING JsonLoader('user_id:chararray, name:chararray, review_count:int, yelping_since:chararray, useful:int, funny:int, cool:int, elite:chararray, friends:chararray, fans:int,  average_stars:float, compliment_hot:int, compliment_more:int, compliment_profile:int, compliment_cute:int, compliment_list:int, compliment_note:int, compliment_plain:int, compliment_cool:int, compliment_funny:int, compliment_writer:int, compliment_photos:int');

Pasted image 20231121110801

year_alone = FOREACH user_json GENERATE (int)SUBSTRING(yelping_since, 0, 4);
nr_elite_years = FOREACH user_json GENERATE SIZE(STRSPLIT(elite, ','));
year_and_nr_elite = FOREACH user_json GENERATE (int)SUBSTRING(yelping_since, 0, 4), SIZE(STRSPLIT(elite, ','));
grouped_data = GROUP year_and_nr_elite BY year;
result = foreach grouped_data GENERATE group, SUM(year_and_nr_elite.nr_elite);
sorted_result = ORDER result by $0;

Pasted image 20231121111433 Pasted image 20231121111612

user_and_nr_friends = FOREACH user_json GENERATE name, SIZE(STRSPLIT(friends, ','));
result = order user_and_nr_friends BY $1;

Pasted image 20231121112554

Most visited restaurant

Loading the dataset

checkin_json = LOAD '/user/hduser/yelp/checkin.json' USING JsonLoader('business_id:chararray, date:chararray');

Pasted image 20231121112950

nr_visited = FOREACH checkin_json GENERATE business_id, SIZE(STRSPLIT(date, ','));

Pasted image 20231121113116 Pasted image 20231121113054

business_id_and_name = FOREACH business_json generate business_id, name;
result = join most_visited by business_id, business_id_and_name by business_id;
result = order result by $1;

Pasted image 20231121113739

UDF

import java.io.IOException;  
import java.util.HashMap;  
import java.util.Map;  
  
import org.apache.pig.EvalFunc;  
import org.apache.pig.data.Tuple;  
  
public class GroupDatesByYear extends EvalFunc<String> {  
    public String convertMapToString(Map<String, Integer> map) {  
        StringBuilder mapAsString = new StringBuilder("{");  
        for (String key : map.keySet()) {  
            mapAsString.append(key + "=" + map.get(key).toString() + ", ");  
        }  
        mapAsString.delete(mapAsString.length() - 2, mapAsString.length()).append("}");  
        return mapAsString.toString();  
    }  
  
    public String exec(Tuple input) throws IOException {  
        if (input == null || input.size() == 0)  
            return null;  
        String str = (String) input.get(0);  
        String[] timestamps = str.split(",");  
        Map<String, Integer> hm = new HashMap<String, Integer>();  
        for (String timestamp : timestamps) {  
            String year = timestamp.strip().split(" ")[0].split("-")[0];  
            Integer tmp = hm.get(year);  
            if (tmp == null) {  
                hm.put(year, 1);  
            } else {  
                hm.put(year, tmp + 1);  
            }  
        }  
        return convertMapToString(hm);  
    }  
}
REGISTER './udf.jar';
checkin_json = LOAD '/user/hduser/yelp/checkin.json' USING JsonLoader('business_id:chararray, date:chararray');
result = FOREACH checkin_json GENERATE GroupDatesByYear(date);
illustrate;

Pasted image 20231121123849