# lawsoc-scraper/create_geodata_file.py
#!/usr/bin/python3
# For each of the office json files extract the url domain and output with lat
# lon to geodata.txt
import os
import json
from urllib.parse import urlparse
def write_file(filename, content):
    """Append *content* as one line to *filename*, deduplicating.

    The file is opened in "a+" mode so it is created when missing.  The
    existing lines are scanned first (rewinding, because "a+" positions
    the stream at EOF) and the write only happens when no existing line
    equals *content*; "a" mode guarantees the new line lands at the end.
    """
    with open(filename, "a+") as fo:
        fo.seek(0)  # rewind from EOF so we can scan what is already there
        if not any(content == line.rstrip('\r\n') for line in fo):
            fo.write(content + '\n')
            print("write_file: content not present, writing.")
        else:
            print("write_file: content present, next.")
def convert_year(date):
    """Convert a "DD/MM/YY" date string to "YYYY/MM/DD".

    Two-digit years use a pivot of 50: 50-99 become 19xx, 00-49 become
    20xx.  A year that is already four digits is passed through unchanged
    (the original produced e.g. "191999" for "1999").
    """
    d, m, y = date.split('/')
    if len(y) == 2:
        # '%d%s' prepends the century: 19 for >= 50, else 20.
        y = '%d%s' % (19 if int(y) >= 50 else 20, y)
    return '/'.join([y, m, d])
def write_json_file(workspace, file, data):
    """Serialise *data* as pretty-printed JSON into ``workspace + file``.

    *workspace* is expected to carry its trailing path separator, since
    the destination path is built by plain string concatenation.
    """
    target = workspace + file
    print(target)
    print(data)
    serialised = json.dumps(data, sort_keys=True, indent=4)
    with open(target, "w") as handle:
        print(serialised)
        handle.write(serialised)
def documents(workspace, fileList):
    """Yield ``(filename, parsed_json)`` pairs for each file in *fileList*.

    *workspace* must end with a path separator because it is concatenated
    directly with each file name.
    """
    print("Entering documents function")
    print("length of fileList: " + str(len(fileList)))
    for index, file in enumerate(fileList):
        print(index, '--->', file)
        # Use a with-block so the handle is closed promptly; the original
        # leaked it by passing open() straight into json.load().
        with open(workspace + file) as handle:
            contents = json.load(handle)
        yield file, contents
# workspace_home = '/var/tmp/lawsoc_new/'
# All paths below are built by plain string concatenation, so every
# directory prefix keeps its trailing slash.
workspace_home = os.getcwd()
workspace = workspace_home + "/json_out/"  # scraped JSON input tree
workspace_out = workspace_home + "/text_files/"  # text outputs written here
office_workspace = workspace + "office/"  # per-office JSON documents
# Output files: complete records, named records, and the three
# "missing data" buckets the main loop sorts offices into.
geodata_fo = workspace_out + "geodata.txt"
geodata_named_fo = workspace_out + "geodata-named.txt"
geodata_missing_both_fo = workspace_out + "geodata_missing_both.txt"
geodata_missing_web_fo = workspace_out + "geodata_missing_website.txt"
geodata_missing_geo_fo = workspace_out + "geodata_missing_geo.txt"
if __name__ == '__main__':
    # Walk every office JSON document and bucket each office by which of
    # (geo coordinates, website) it is missing, appending a line to the
    # matching output file via the deduplicating write_file().
    office_fileList = os.listdir(office_workspace)
    for file, data in documents(office_workspace, office_fileList):
        print("File: ", file)
        Name = data["name"]
        Website = data["web"]
        # lat/lon come out of the JSON as strings, so every comparison
        # below is against the string '0', not the integer 0.
        latitude = data["location"]["lat"]
        longitude = data["location"]["lon"]
        print("lon: ", longitude)
        print("lat: ", latitude)
        parsed_uri = urlparse(Website)
        # An empty website string parses to scheme='' / netloc='',
        # which formats to the sentinel ":///".
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        print("domain: ", domain)
        geo_content = domain + ' ' + latitude + ' ' + longitude
        geo_content_named = (Name + ' ' + domain + ' '
                             + latitude + ' ' + longitude)
        # BUG FIX: the original tested `latitude == 0 and longitude == 0`
        # (integer 0) here, so the "Missing both" branch could never fire.
        if latitude == '0' and longitude == '0' and domain == ":///":
            print("Missing both")
            write_file(geodata_missing_both_fo, geo_content_named)
        elif latitude == '0' and longitude == '0':
            print("Missing geo")
            write_file(geodata_missing_geo_fo, geo_content_named)
        elif domain == ":///":
            print("Missing website")
            write_file(geodata_missing_web_fo, geo_content_named)
        else:
            print("All good")
            write_file(geodata_fo, geo_content)
            write_file(geodata_named_fo, geo_content_named)