From 92577d2272899645361d003ae4a4f61c536a051f Mon Sep 17 00:00:00 2001 From: Sriram Madapusi Vasudevan Date: Thu, 11 Jun 2015 14:42:35 -0400 Subject: [PATCH] feat: add log delivery pig script - The hadoop script splits up the provider's logs that are piped into it, based on those domains that have log delivery enabled. - README.rst contains instructions on how the script is meant to be used. Implements: blueprint log-delivery Change-Id: I4434175bead26e9b78a3115038af55b25a62163c --- hadoop/README.rst | 22 ++++++++++++++++++++++ hadoop/log_delivery.pig | 11 +++++++++++ 2 files changed, 33 insertions(+) create mode 100644 hadoop/README.rst create mode 100644 hadoop/log_delivery.pig diff --git a/hadoop/README.rst b/hadoop/README.rst new file mode 100644 index 00000000..3fc9ca73 --- /dev/null +++ b/hadoop/README.rst @@ -0,0 +1,22 @@ +Log Delivery +============ + +The pig script needs to be run in a Hadoop cluster, after piping in all the required logs from the provider with whom services are set up. + +NOTE: + * All the domains that need to have logs delivered need to be copied into the INPUT directory on the Hadoop cluster, one domain per line, under the name `domains_log.tsv` + * The corresponding provider URL extension (`PROVIDER_URL_EXT`) also needs to be set + +How to run a Pig Script +======================= + + $ pig -p INPUT=~/log_source -p OUTPUT=~/logs_output -p PROVIDER_URL_EXT=mycdn log_delivery.pig + + +Output +====== + +There should be directories created under OUTPUT, with each directory corresponding to a domain that had log delivery enabled, and log files underneath each of those directories pertaining to that domain. 
+
+    $ logs_output/mydomain/mydomain-0000.gz
+    $ logs_output/yourdomain/yourdomain-0000.gz
diff --git a/hadoop/log_delivery.pig b/hadoop/log_delivery.pig
new file mode 100644
index 00000000..b9d2901b
--- /dev/null
+++ b/hadoop/log_delivery.pig
@@ -0,0 +1,48 @@
+-- Log delivery: split a provider's access logs into one output directory per
+-- domain, keeping only the domains listed in INPUT/domains_log.tsv.
+--
+-- Parameters (pass each one as: pig -p NAME=value):
+--   INPUT             directory with the gzipped provider logs (*.gz) and
+--                     domains_log.tsv (one domain per line)
+--   OUTPUT            directory that receives the per-domain log directories
+--   PROVIDER_URL_EXT  text that follows the domain and a dot in the request
+--                     URI. It is spliced into a regex, so any regex
+--                     metacharacters in it keep their special meaning.
+
+-- piggybank provides the MultiStorage store function used at the end.
+REGISTER /usr/lib/pig/piggybank.jar;
+
+-- Raw provider logs, tab-separated.
+logs = LOAD '$INPUT/*.gz' USING PigStorage('\t') AS (
+    date, time, ip, method, uri, status, bytes:long, time_taken,
+    referer, user_agent, cookie, country
+);
+
+-- Domains that have log delivery enabled, one per line.
+log_domains = LOAD '$INPUT/domains_log.tsv' USING PigStorage('\n') AS (domains:chararray);
+
+-- Rewrite each request as a common-log-style record (ip, -, -, bracketed
+-- timestamp, quoted request line, status, bytes, referer, user_agent) and
+-- extract the domain from the URI. [.] matches only a literal dot, whereas a
+-- bare dot in the pattern would accept any character before the extension.
+formatted_logs = FOREACH logs GENERATE
+    ip,
+    '-',
+    '-',
+    org.apache.pig.builtin.StringConcat('[', date, ':', time, ' +0000', ']'),
+    org.apache.pig.builtin.StringConcat('"', method, ' ', uri, ' ', 'HTTP/1.1', '"'),
+    status,
+    bytes,
+    referer,
+    user_agent,
+    REGEX_EXTRACT(uri, '/([^/]*)[.]$PROVIDER_URL_EXT(/.*)', 1) AS domain;
+
+-- Inner join: only requests whose extracted domain is listed in log_domains
+-- are kept. The joined record is the domains column (index 0) followed by the
+-- ten formatted_logs columns, which puts the extracted domain at index 10.
+delivery_enabled_formatted_logs = JOIN log_domains BY domains, formatted_logs BY domain;
+
+-- Write one sub-directory per distinct value of field 10 (the domain), gzip
+-- compressed and tab-delimited. The field index is passed as a quoted string
+-- because Pig function constructor arguments are string literals.
+STORE delivery_enabled_formatted_logs INTO '$OUTPUT'
+    USING org.apache.pig.piggybank.storage.MultiStorage('$OUTPUT', '10', 'gz', '\\t');