awk で apache のアクセスログを解析してみる修行

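動的ファイルへのアクセスを拾って、接続元の IP アドレスごとのファイルに振り分けてみたり。出力ファイル名は「prefix の値 + "_" + IP アドレス」になります（prefix はスクリプト内では設定していないので、例えば `-v prefix=out` のように実行時に渡す想定です）。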

# One-time setup of the lookup tables consulted by the per-record rules.
BEGIN {
  # except_ext: every listed suffix (and the bare "/" entry) is flagged with 1.
  _init_n = split("gif jpg swf html css js ico / txt csv", _init_list, " ");
  for (_init_i = 1; _init_i <= _init_n; _init_i++)
    except_ext[_init_list[_init_i]] = 1;

  # except_ip: only the empty key is registered here.
  except_ip[""] = 1;

  # month: three-letter English month name -> zero-padded two-digit string.
  _init_n = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", _init_list, " ");
  for (_init_i = 1; _init_i <= _init_n; _init_i++)
    month[_init_list[_init_i]] = sprintf("%02d", _init_i);

  # Empty the scratch list so it does not linger as a populated global.
  split("", _init_list);
}
# Return the "extension" of a URL path: the text after its last "." when
# that text is non-empty, otherwise the whole path unchanged (so "/" stays
# "/" and an extension-less path is returned as is).
# parts and n are local scratch variables, not arguments.
function path_extension(p,    parts, n) {
  n = split(p, parts, /[.]/);
  if (n >= 2 && parts[n] != "")
    return parts[n];
  return p;
}

# Blank or truncated records do not carry the nine fields read below;
# skip them instead of letting later rules emit a garbage line for them.
NF < 9 {
  next;
}

# Break one access-log record into globals read by the later rules:
#   ip, retcode  - fields 1 and 9
#   day, time    - digits assembled from field 4 (year month day / h m s)
#   path, query  - field 7 split at its first "?"
#   ext          - path_extension(path)
# xx, xy, tz, method and version are captured as well; nothing in the rules
# shown here reads them.
{
  ip = $1;
  xx = $2;
  xy = $3;
  tm = $4;
  tz = $5;
  method = $6;
  url = $7;
  version = $8;
  retcode = $9;

  # Assumes field 4 looks like "[10/Oct/2000:13:55:36" (Apache access-log
  # timestamp): splitting on "[", ":" and "/" leaves a[1] empty, followed by
  # day, month name, year, hour, minute, second.
  split(tm, a, /[\[:\/]/ );
  day =  a[4] month[a[3]] a[2];
  time = a[5] a[6] a[7];

  # Everything before the first "?" is the path, everything after it the
  # query string; without a "?" the query is empty.
  qpos = index(url, "?");
  if (qpos > 0) {
    path = substr(url, 1, qpos - 1);
    query = substr(url, qpos + 1);
  }
  else {
    path = url;
    query = "";
  }

  ext = path_extension(path);
}

# Drop the record when its ext is flagged (> 0) in except_ext, so the rules
# that follow never see it.
# The `in` test comes first on purpose: merely referencing except_ext[ext]
# creates an empty element for every distinct ext encountered, which would
# make the table grow with the log.
(ext in except_ext) && except_ext[ext] > 0 {
  next;
}

# Write one line for each record that reaches this rule to a per-client
# file named <prefix>_<ip>. The line holds day, time, ip, retcode, path and
# query, followed by the ext wrapped in braces.
# prefix is not assigned anywhere in the rules shown here; presumably it is
# supplied on the command line -- verify against how the script is invoked.
# NOTE: awk keeps every output file open until exit or close(), so a log
# with many distinct client addresses holds many files open at once.
{
  # The target is parenthesised because an unparenthesised concatenation
  # after ">" is parsed differently by different awk implementations.
  print day, time, ip, retcode, path, query, "{" ext "}" > (prefix "_" ip);
}