#!/usr/bin/env python3

# Add location to the doc mapping produced by nutch

import logging
# import sys
import os
import json
# import datetime
from urllib.parse import urlparse

from elasticsearch import Elasticsearch
# from dateutil.parser import parse as parse_date

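# A minimal, untested sketch of how the "location" field could be added to the
# existing mapping as a geo_point, since the docs indexed by nutch do not carry it.
# The function name and defaults are assumptions (the index name and doc_type are
# taken from the main block below); it also assumes an older elasticsearch-py
# client that still accepts doc_type.
def add_location_mapping(es, index="index", doc_type="doc"):
    # Extend the type mapping with a geo_point field called "location".
    es.indices.put_mapping(
        index=index,
        doc_type=doc_type,
        body={doc_type: {"properties": {"location": {"type": "geo_point"}}}}
    )
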
def write_file(filename, content):
    # Open the file for read/append; only append the line if it is not already present.
    with open(filename, "a+") as fo:
        fo.seek(0)
        if not any(content == x.rstrip('\r\n') for x in fo):
            fo.write(content + '\n')
            print("write_file: content not present, writing.")
        else:
            print("write_file: content present, next.")


def convert_year(date):
    # Convert a "DD/MM/YY" date into "YYYY/MM/DD", pivoting two-digit years:
    # years >= 50 become 19xx, years < 50 become 20xx.
    d, m, y = date.split('/')
    return "/".join(['%d%s' % (19 if int(y) >= 50 else 20, y), m, d])


def write_json_file(workspace, file, data):
    file = workspace + file
    print(file)
    print(data)
    with open(file, "w") as jsonFile:
        json_contents = json.dumps(data, sort_keys=True, indent=4)
        print(json_contents)
        jsonFile.write(json_contents)


def documents(workspace, fileList):
    print("Entering documents function")
    print("length of fileList: " + str(len(fileList)))
    # for file in fileList:
    for index, file in enumerate(fileList):
        print(index, '--->', file)
        # For each file in the list, read its json and yield it from the generator
        with open(workspace + file) as json_file:
            contents = json.load(json_file)
        # print(contents)
        yield file, contents


def print_hits(results, facet_masks={}):
    """Simple utility function to print results of a search query."""
    # print(results)
    # print('=' * 80)
    print('Total %d found in %dms' % (results['hits']['total'], results['took']))
    # if results['hits']['hits']:
    #     print('-' * 80)
    for hit in results['hits']['hits']:
        print(hit['_score'])
        print(hit['_source']['host'])
        # get created date for a repo and fallback to authored_date for a commit
        # created_at = parse_date(hit['_source'].get('tstamp', hit['_source']['lastModified']))
        # print('/%s/%s/%s (%s): %s' % (
        #     hit['_index'], hit['_type'], hit['_id'],
        #     created_at.strftime('%Y-%m-%d'),
        #     hit['_source']['title'].replace('\n', ' ')))

    for facet, mask in facet_masks.items():
        print('-' * 80)
        for d in results['facets'][facet]['terms']:
            print(mask % d)
    print('=' * 80)
    print()


def search_es():
    # Using the normal 'for each' loop, get the domains from the "All good" section.
    # Call a function that searches ES for the docs with the same web/url and, for each
    # one returned, add the location lat/lon. If they had been added by nutch then we
    # wouldn't have to change the mapping.
    pass


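# A rough, untested sketch of the update step described in search_es above: given one
# of the "All good" domains plus its lat/lon, find the matching docs by url and patch
# a "location" geo_point onto each hit with a partial update. The function name and
# parameters are assumptions, not part of the original script, and it assumes an older
# elasticsearch-py client that still takes doc_type.
def add_location_to_docs(es, domain, lat, lon, index="index", doc_type="doc"):
    # Find every doc whose url matches the domain...
    result = es.search(
        index=index,
        doc_type=doc_type,
        body={'query': {'match': {'url': domain}}}
    )
    # ...and patch the geo_point "location" onto each hit.
    for hit in result['hits']['hits']:
        es.update(
            index=index,
            doc_type=doc_type,
            id=hit['_id'],
            body={'doc': {'location': {'lat': float(lat), 'lon': float(lon)}}}
        )
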
def get_domain(office_workspace, office_fileList):
    for file, data in documents(office_workspace, office_fileList):
        tmp = data["location"]
        # print("Office fileList: ", office_fileList)
        # print("Office workspace: ", office_workspace)
        print("File: ", file)
        # print(type(tmp))
        # print(len(tmp))
        Name = data["name"]
        Website = data["web"]
        # Cast lat/lon to strings so they concatenate and compare consistently below
        latitude = str(data["location"]["lat"])
        longitude = str(data["location"]["lon"])
        print("lon: ", data["location"]["lon"])
        print("lat: ", data["location"]["lat"])
        parsed_uri = urlparse(Website)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        print("domain: ", domain)
        geo_content = (domain + ' ' + latitude + ' ' + longitude)
        geo_content_named = (Name + ' ' + domain + ' ' + latitude + ' ' + longitude)
        if (latitude == '0' and longitude == '0') and domain == ":///":
            print("Missing both")
            # write_file(geodata_missing_both_fo, geo_content_named)
        elif latitude == '0' and longitude == '0':
            print("Missing geo")
            # write_file(geodata_missing_geo_fo, geo_content_named)
        elif domain == ":///":
            print("Missing website")
            # write_file(geodata_missing_web_fo, geo_content_named)
        else:
            print("All good")
            # write_file(geodata_fo, geo_content)
            # write_file(geodata_named_fo, geo_content_named)
            yield domain


# get trace logger and set level
tracer = logging.getLogger('elasticsearch.trace')
tracer.setLevel(logging.INFO)
tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))

# workspace_home = '/var/tmp/lawsoc_new/'
workspace_home = os.getcwd()
workspace = workspace_home + "/json_out/"
office_workspace = workspace + "office/"
geodata_fo = workspace_home + "/geodata.txt"
geodata_named_fo = workspace_home + "/geodata-named.txt"
geodata_missing_both_fo = workspace_home + "/geodata_missing_both.txt"
geodata_missing_web_fo = workspace_home + "/geodata_missing_website.txt"
geodata_missing_geo_fo = workspace_home + "/geodata_missing_geo.txt"

office_fileList = os.listdir(office_workspace)
document_index = "index"

if __name__ == '__main__':
    # Get a list of files in the workspace...
    es = Elasticsearch()
    # print("Empty search: ")
    # print_hits(es.search(index='index'))
    # for domain in get_domain(office_workspace, office_fileList):
    # NB: 'filtered' queries and 'not' filters only exist on older (pre-5.x)
    # Elasticsearch clusters, which is what this query assumes.
    result = es.search(
        index='index',
        doc_type='doc',
        body={
            'query': {
                'filtered': {
                    'query': {
                        'match': {'url': 'http://www.renshawderrick.co.uk/'}
                    },
                    'filter': {
                        'not': {
                            'term': {'files': 'test_elasticsearch'}
                        }
                    }
                }
            }
        }
    )
    print_hits(result)
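    # On Elasticsearch 5.x and later the equivalent query (a sketch only, not used
    # above) would be a bool query with must/must_not instead of 'filtered'/'not':
    #
    # result = es.search(
    #     index='index',
    #     body={
    #         'query': {
    #             'bool': {
    #                 'must': {'match': {'url': 'http://www.renshawderrick.co.uk/'}},
    #                 'must_not': {'term': {'files': 'test_elasticsearch'}}
    #             }
    #         }
    #     }
    # )
    # print_hits(result)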