#!/usr/bin/env python3
# Add location to the doc mapping produced by nutch
import logging
# import sys
import os
import json
# import datetime
from urllib.parse import urlparse
from elasticsearch import Elasticsearch
# from dateutil.parser import parse as parse_date


def write_file(filename, content):
    # Append content to the file only if an identical line is not already present.
    with open(filename, "a+") as fo:
        fo.seek(0)
        if not any(content == x.rstrip('\r\n') for x in fo):
            fo.write(content + '\n')
            print("write_file: content not present, writing.")
        else:
            print("write_file: content present, next.")


def convert_year(date):
    # Expand a two-digit year in a d/m/y date and reorder it to y/m/d,
    # e.g. '01/12/99' -> '1999/12/01' and '01/12/05' -> '2005/12/01'.
    d, m, y = date.split('/')
    return "/".join(['%d%s' % (19 if int(y) >= 50 else 20, y), m, d])


def write_json_file(workspace, file, data):
    file = workspace + file
    print(file)
    print(data)
    with open(file, "w") as jsonFile:
        json_contents = json.dumps(data, sort_keys=True, indent=4)
        print(json_contents)
        jsonFile.write(json_contents)


def documents(workspace, fileList):
    print("Entering documents function")
    print("length of fileList: " + str(len(fileList)))
    # For each file in the list, read its JSON and yield it as a generator.
    for index, file in enumerate(fileList):
        print(index, '--->', file)
        with open(workspace + file) as json_file:
            contents = json.load(json_file)
        # print(contents)
        yield file, contents


def print_hits(results, facet_masks={}):
    """Simple utility function to print results of a search query."""
    # print(results)
    # print('=' * 80)
    print('Total %d found in %dms' % (results['hits']['total'], results['took']))
    # if results['hits']['hits']:
    #     print('-' * 80)
    for hit in results['hits']['hits']:
        print(hit['_score'])
        print(hit['_source']['host'])
        # get created date for a repo and fall back to authored_date for a commit
        # created_at = parse_date(hit['_source'].get('tstamp', hit['_source']['lastModified']))
        # print('/%s/%s/%s (%s): %s' % (
        #     hit['_index'], hit['_type'], hit['_id'],
        #     created_at.strftime('%Y-%m-%d'),
        #     hit['_source']['title'].replace('\n', ' ')))
    for facet, mask in facet_masks.items():
        print('-' * 80)
        for d in results['facets'][facet]['terms']:
            print(mask % d)
    print('=' * 80)
    print()


def search_es():
    # TODO: using a normal 'for each' loop, get the domains from the "All good"
    # section, then call a function that searches ES for the docs with the same
    # web/url and, for each doc returned, add the location lat/lon. If nutch had
    # added these we wouldn't have to change the mapping. See add_location_to_docs
    # below for one possible shape of that update step.
    pass
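

# A minimal sketch of the update step described in search_es, assuming the same
# 'index'/'doc' index and doc_type used in __main__ below and that each matching
# document should simply gain a "location" field holding lat/lon. The function
# name and field layout are illustrative assumptions, not part of the original
# scraper.
def add_location_to_docs(es, domain, lat, lon, index="index", doc_type="doc"):
    """Find docs whose url matches the given domain and merge in a location."""
    results = es.search(index=index, doc_type=doc_type,
                        body={'query': {'match': {'url': domain}}})
    for hit in results['hits']['hits']:
        # Partial update: merges the location object into the existing document.
        es.update(index=index, doc_type=doc_type, id=hit['_id'],
                  body={'doc': {'location': {'lat': lat, 'lon': lon}}})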


def get_domain(office_workspace, office_fileList):
    for file, data in documents(office_workspace, office_fileList):
        tmp = data["location"]
        # print("Office fileList: ", office_fileList)
        # print("Office workspace: ", office_workspace)
        print("File: ", file)
        # print(type(tmp))
        # print(len(tmp))
        Name = data["name"]
        Website = data["web"]
        latitude = str(data["location"]["lat"])
        longitude = str(data["location"]["lon"])
        print("lon: ", data["location"]["lon"])
        print("lat: ", data["location"]["lat"])
        parsed_uri = urlparse(Website)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        print("domain: ", domain)
        geo_content = (domain + ' ' + latitude + ' ' + longitude)
        geo_content_named = (Name + ' ' + domain + ' ' + latitude + ' ' + longitude)
        if (latitude == '0' and longitude == '0') and domain == ":///":
            print("Missing both")
            # write_file(geodata_missing_both_fo, geo_content_named)
        elif latitude == '0' and longitude == '0':
            print("Missing geo")
            # write_file(geodata_missing_geo_fo, geo_content_named)
        elif domain == ":///":
            print("Missing website")
            # write_file(geodata_missing_web_fo, geo_content_named)
        else:
            print("All good")
            # write_file(geodata_fo, geo_content)
            # write_file(geodata_named_fo, geo_content_named)
            yield domain


# Get the trace logger and set its level.
tracer = logging.getLogger('elasticsearch.trace')
tracer.setLevel(logging.INFO)
tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))

# workspace_home = '/var/tmp/lawsoc_new/'
workspace_home = os.getcwd()
workspace = workspace_home + "/json_out/"
office_workspace = workspace + "office/"
geodata_fo = workspace_home + "/geodata.txt"
geodata_named_fo = workspace_home + "/geodata-named.txt"
geodata_missing_both_fo = workspace_home + "/geodata_missing_both.txt"
geodata_missing_web_fo = workspace_home + "/geodata_missing_website.txt"
geodata_missing_geo_fo = workspace_home + "/geodata_missing_geo.txt"
office_fileList = os.listdir(office_workspace)
document_index = "index"


if __name__ == '__main__':
    # Get a list of files in the workspace...
    es = Elasticsearch()
    # print("Empty search: ")
    # print_hits(es.search(index='index'))
    # for domain in get_domain(office_workspace, office_fileList):
    result = es.search(
        index='index',
        doc_type='doc',
        body={
            'query': {
                'filtered': {
                    'query': {
                        'match': {'url': 'http://www.renshawderrick.co.uk/'}
                    },
                    'filter': {
                        'not': {
                            'term': {'files': 'test_elasticsearch'}
                        }
                    }
                }
            }
        }
    )
    print_hits(result)
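
    # A hypothetical extension (not part of the original flow): if get_domain were
    # changed to also yield the coordinates, the commented-out loop above could
    # drive the add_location_to_docs sketch, e.g.
    #
    #     for domain, lat, lon in get_domain(office_workspace, office_fileList):
    #         add_location_to_docs(es, domain, lat, lon)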