1 | #!/usr/bin/perl -w |
---|
2 | |
---|
3 | ######################################################################## |
---|
4 | # |
---|
5 | # analyze-traffic.pl |
---|
6 | # |
---|
7 | # Written by Jeffrey M. Vinocur <jeff@litech.org> |
---|
8 | # This work is hereby placed in the public domain by its author. |
---|
9 | # |
---|
10 | ######################################################################## |
---|
11 | # |
---|
12 | # Usage: |
---|
13 | # |
---|
14 | # 1. Add an entry in $pathetc/newsfeeds like the one below, and issue |
---|
15 | # `ctlinnd reload newsfeeds traffic` (you can change the path to |
---|
16 | # whatever you like). |
---|
17 | # |
---|
18 | # analyze!\ |
---|
19 | # :*\ |
---|
20 | # :Tf,WgsbmnN:/usr/local/news/log/traffic |
---|
21 | # |
---|
22 | # You may find it useful to restrict the articles being logged, |
---|
23 | # either by modifing the wildmat pattern, or by using the /exclude |
---|
24 | # notation to indicate articles that have passed through some |
---|
25 | # servers should not be included. |
---|
26 | # |
---|
27 | # Also, if $USE_ALL_GROUPS (see below) is false, you can leave out |
---|
28 | # the N flag, which may eliminate some parsing errors. |
---|
29 | # |
---|
30 | # 2. Wait for some data. |
---|
31 | # |
---|
32 | # 3. Run analyze-traffic.pl on the logged data (you can pass the |
---|
33 | # filename as an argument or feed the data on standard input). |
---|
34 | # You probably want to pipe it into a file, `less`, or `tail` as the |
---|
35 | # output is a line for every group that has received an article |
---|
36 | # according the input data. |
---|
37 | # |
---|
38 | # There are some options hardcoded into the script below, under |
---|
39 | # "Constants" -- check for customization, if you like. |
---|
40 | # |
---|
41 | # 4. Be sure to comment out the newsfeeds entry when done, or set |
---|
42 | # up some sort of log rotation, or INN will eventually fill up your |
---|
43 | # disk... |
---|
44 | # |
---|
45 | ######################################################################## |
---|
46 | # |
---|
47 | # Implementation notes and known bugs: |
---|
48 | # |
---|
49 | # - We try (if $USE_ALL_GROUPS is set, below) to count crossposted |
---|
50 | # towards each listed group (even ones not carried on the server!), |
---|
51 | # but since some articles have funky Newsgroups headers, that can |
---|
52 | # backfire. So parsing can fail, which usually results in the |
---|
53 | # relevant line being skipped, but occasionally can cause Perl to |
---|
54 | # issue warnings (and perhaps produce funny things in the output). |
---|
55 | # |
---|
56 | # A workaround would be to repeat e.g. the Message-ID at the end of |
---|
57 | # the intput format (i.e. WgsbmnNm), and then the script could read as |
---|
58 | # many lines as necessary until that ad hoc end-of-record marker |
---|
59 | # appeared. I haven't found a need for this yet, though. |
---|
60 | # |
---|
61 | # - The input format is a sequence of lines, each containing a number of |
---|
62 | # space-separated fields. Check newsfeeds(5) for what the semantics |
---|
63 | # are, but an example line (wrapped), for reference, looks like: |
---|
64 | # |
---|
65 | # rec.aviation.military [space] |
---|
66 | # news-out.maxwell.syr.edu [space] |
---|
67 | # 2796 [space] |
---|
68 | # <3Jvua.104184$My6.1642017@twister.tampabay.rr.com> [space] |
---|
69 | # @030247454E45524C31000016AD3100000004@ [space] |
---|
70 | # rec.aviation.military,rec.travel.usa-canada, [no space here] |
---|
71 | # sci.electronics.design,sci.econ,sci.environment |
---|
72 | # |
---|
73 | # - The output format is a sequence of lines, one for each newsgroup, |
---|
74 | # with three tab-separated fields. They are sorted by either the |
---|
75 | # second or third field, depending on $SORT_BY_SIZE, below. The first |
---|
76 | # field is the name of the newsgroup. The second is the total number |
---|
77 | # of articles appearing in that newsgroup followed by, in parentheses, |
---|
78 | # the short name of the peer (see about $TLD_REGEX below) responsible |
---|
79 | # for the most articles and the percentage it made up. The third is |
---|
80 | # the total number of kilobytes of (accepted) traffic in that |
---|
81 | # newsgroup, followed similarly by the peer responsible for the most |
---|
82 | # traffic in that group. It looks something like this: |
---|
83 | # |
---|
84 | # news.lists.filters 1057 arts (63% syr) 7105.9 KB (36% cox) |
---|
85 | # |
---|
86 | # The short names are made by taking the last component of the |
---|
87 | # (dot-separated) peer name that doesn't match /$TLD_REGEX/. The idea |
---|
88 | # is that, for example, "isc.org" would be listed as "isc", and |
---|
89 | # "demon.co.uk" would be listed as "demon". Adjust $TLD_REGEX as |
---|
90 | # needed to trim the top-level domains in your part of the world. |
---|
91 | # |
---|
92 | # If your peers have very long short names, the output may look |
---|
93 | # somewhat funny. Similar things can happen with newsgroup names, so |
---|
94 | # those longer than $FIELD1_WIDTH will be truncated to fit. (You can |
---|
95 | # set $FIELD1_WIDTH to '' to skip this truncation, in which case the |
---|
96 | # first column will not be space-padded and the output will look a bit |
---|
97 | # ragged.) |
---|
98 | # |
---|
99 | ######################################################################## |
---|
100 | # |
---|
101 | # Constants: |
---|
102 | |
---|
103 | my $USE_ALL_GROUPS = 1; # if 0, use only group article is stored under |
---|
104 | my $SORT_BY_SIZE = 1; # if 0, sort output by number of articles |
---|
105 | my $FIELD1_WIDTH = 30; # maximum length of newsgroup name, '' for none |
---|
106 | |
---|
107 | my $TLD_REGEX = '^(?:com|net|org|edu|gov|mil|ac|co|uk|au|ca|de)$'; |
---|
108 | # feel free to add any others as needed |
---|
109 | |
---|
110 | |
---|
111 | ######################################################################## |
---|
112 | |
---|
113 | use strict; |
---|
114 | |
---|
115 | my %stats; |
---|
116 | |
---|
117 | while( <> ) { |
---|
118 | my ($group, $peer, $bytes, $id, $token, @Newsgroups) = split; |
---|
119 | next unless ($USE_ALL_GROUPS ? @Newsgroups : $token); # bad input line |
---|
120 | |
---|
121 | my @groups = map { split /\s*,\s*/ } @Newsgroups; |
---|
122 | |
---|
123 | foreach ($USE_ALL_GROUPS && @groups ? @groups : $group) { |
---|
124 | my $s = $stats{$_} ||= { count => 0, |
---|
125 | bytes => 0, |
---|
126 | peers => {}, |
---|
127 | }; |
---|
128 | |
---|
129 | $s->{count}++; |
---|
130 | $s->{bytes} += $bytes; |
---|
131 | $s->{peers}->{$peer}->{count}++; |
---|
132 | $s->{peers}->{$peer}->{bytes} += $bytes; |
---|
133 | } |
---|
134 | } |
---|
135 | |
---|
136 | |
---|
137 | my $f = $SORT_BY_SIZE ? 'bytes' : 'count'; |
---|
138 | |
---|
139 | foreach (sort { $stats{$a}->{$f} <=> $stats{$b}->{$f} } (keys %stats)) { |
---|
140 | my %s = %{$stats{$_}}; |
---|
141 | |
---|
142 | my ($topcount,$topcountwho) = &max('count', $s{peers}); |
---|
143 | my ($topbytes,$topbyteswho) = &max('bytes', $s{peers}); |
---|
144 | $topcountwho = &trim($topcountwho); |
---|
145 | $topbyteswho = &trim($topbyteswho); |
---|
146 | my $countf = int(100 * $topcount / $s{count}); |
---|
147 | my $bytesf = int(100 * $topbytes / $s{bytes}); |
---|
148 | my $kb = 0.1 * int($s{bytes} * 10 / 1024); |
---|
149 | |
---|
150 | my $ng = $FIELD1_WIDTH eq '' ? $_ : substr($_,0,$FIELD1_WIDTH); |
---|
151 | print +(sprintf("%-${FIELD1_WIDTH}s\t", $ng)), |
---|
152 | "$s{count} arts ($countf% $topcountwho)\t", |
---|
153 | "${kb} KB ($bytesf% $topbyteswho)\n"; |
---|
154 | } |
---|
155 | |
---|
156 | 1; |
---|
157 | |
---|
158 | sub trim { |
---|
159 | my @parts = split(/\./, $_[0]); |
---|
160 | my $part; |
---|
161 | while( defined($part = pop(@parts)) ) { |
---|
162 | last unless $part =~ /$TLD_REGEX/o; |
---|
163 | } |
---|
164 | return defined($part) ? $part : $_[0]; |
---|
165 | } |
---|
166 | |
---|
167 | sub max { |
---|
168 | my $x = 0; |
---|
169 | my $who; |
---|
170 | my ($field, $listref) = @_; |
---|
171 | |
---|
172 | while( my ($peer, $stats) = each %{$listref} ) { |
---|
173 | if( $stats->{$field} > $x ) { |
---|
174 | $x = $stats->{$field}; |
---|
175 | $who = $peer; |
---|
176 | } |
---|
177 | } |
---|
178 | |
---|
179 | return ($x, $who); |
---|
180 | } |
---|