Be more careful when crawling remote data

Specify a credible user agent when retrieving data from URLs.
Also change the default member report update period to 30 days instead of a week.

Partial bug 1361927

Change-Id: I7f956912c2cc3aa2765a20ed193e1e34ecbf7f91
This commit is contained in:
Ilya Shakhat 2014-10-02 18:24:52 +04:00
parent 9f8d82d801
commit 8d08624711
4 changed files with 18 additions and 3 deletions

View File

@ -18,7 +18,7 @@
# listen_port = 8080
# Number of days to update members
# days_to_update_members = 7
# days_to_update_members = 30
# The address of file with corrections data
# corrections_uri = https://git.openstack.org/cgit/stackforge/stackalytics/plain/etc/corrections.json

View File

@ -29,7 +29,7 @@ OPTS = [
help='The address dashboard listens on'),
cfg.IntOpt('listen-port', default=8080,
help='The port dashboard listens on'),
cfg.IntOpt('days_to_update_members', default=7,
cfg.IntOpt('days_to_update_members', default=30,
help='Number of days to update members'),
cfg.StrOpt('corrections-uri',
default=('https://git.openstack.org/cgit/'

View File

@ -12,6 +12,7 @@
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import re
import time
@ -106,5 +107,7 @@ def log(uri, runtime_storage_inst, days_to_update_members, members_look_ahead):
LOG.debug('New member: %s', member['member_id'])
yield member
time.sleep(random.random() * 5)
LOG.debug('Last_member_index: %s', last_member_index)
runtime_storage_inst.set_by_key('last_member_index', last_member_index)

View File

@ -17,6 +17,7 @@ import cgi
import datetime
import gzip
import json
import random
import re
import time
@ -86,9 +87,20 @@ def check_email_validity(email):
return False
user_agents = [
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) Gecko/20100101 Firefox/32.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_6) AppleWebKit/537.78.2',
'Mozilla/5.0 (Windows NT 6.3; WOW64) Gecko/20100101 Firefox/32.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X) Chrome/37.0.2062.120',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
]
def read_uri(uri):
try:
fd = six.moves.urllib.request.urlopen(uri)
req = six.moves.urllib.request.Request(
url=uri, headers={'User-Agent': random.choice(user_agents)})
fd = six.moves.urllib.request.urlopen(req)
raw = fd.read()
fd.close()
return raw