These are the notes I took while taking the “Data Wrangling with MongoDB” course at Udacity. They cover how to use Python to process CSV, XML, and Excel files, how to work with MongoDB, and a few examples of page scraping in Python.
data = []
with open(datafile, "r") as f:
    titles = f.readline().split(",")        # first line holds the column titles
    i = 0
    for line in f:
        data.append(create_data(titles, line.split(",")))   # create_data comes from the exercise
        i = i + 1
        if i > 10:                          # only look at the first few rows
            break
return data
import xlrd

workbook = xlrd.open_workbook(datafile)
sheet = workbook.sheet_by_index(0)
# read the first two columns of every row into a list of lists
data = [[sheet.cell_value(r, col) for col in range(2)] for r in range(sheet.nrows)]
# starting point for a minimum search over column 1 (row 0 is the header)
minVal = data[1][1]
minIndex = 1
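The notes stop at the starting values; assuming row 0 is a header row, the search loop they set up would look something like this:
for r in range(1, sheet.nrows):
    if data[r][1] < minVal:        # new smallest value in column 1
        minVal = data[r][1]
        minIndex = r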
import csv

data = []
with open(datafile, 'rb') as f:
    reader = csv.reader(f)          # delimiter=',', quotechar='|' could be passed explicitly
    name = reader.next()[1]         # second field of the first row
    print name
    reader.next()                   # skip one more row
    while True:
        try:
            data.append(reader.next())
        except StopIteration:
            break
with open(filename, 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='|')
    for row in data:
        writer.writerow(row)
import xml.etree.ElementTree as ET

tree = ET.parse(fname)
root = tree.getroot()
authors = []
for author in root.findall('./fm/bibl/aug/au'):
    data = {
        "fnm": author.find('fnm').text,
        "snm": author.find('snm').text,
        "email": author.find('email').text,
    }
    authors.append(data)
for author in root.findall('./fm/bibl/aug/au'):
    data = {"insr": []}             # collect the insr ids per author
    insrs = author.findall('./insr')
    for insr in insrs:
        data["insr"].append(insr.attrib["iid"])
    print data
from bs4 import BeautifulSoup
data = {}
with open(page, "r") as html:
    soup = BeautifulSoup(html)
    # the hidden fields an ASP.NET form expects back on a POST
    data["eventvalidation"] = soup.find(id="__EVENTVALIDATION")['value']
    data["viewstate"] = soup.find(id="__VIEWSTATE")['value']
~~requests.post("http://..", data={...})~~
A one-off post like this does not keep the cookies the site sets; a Session carries them across requests:
import requests
s = requests.Session()
s.post("http://..", data={...})
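Putting the scraping pieces together (the URL is left as in my notes, and the extra form fields a real site would need are omitted):
import requests
from bs4 import BeautifulSoup

s = requests.Session()

# GET the form page first, so the session holds the site's cookies
r = s.get("http://..")
soup = BeautifulSoup(r.text)
eventvalidation = soup.find(id="__EVENTVALIDATION")['value']
viewstate = soup.find(id="__VIEWSTATE")['value']

# POST the form back, echoing the hidden ASP.NET fields
r = s.post("http://..",
           data={"__EVENTVALIDATION": eventvalidation,
                 "__VIEWSTATE": viewstate})
html = r.text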
from pymongo import MongoClient

client = MongoClient('localhost:27017')
db = client[db_name]

query = {"manufacturer" : "Porsche"}
return db.autos.find(query)                        # cursor over all matching documents
return db.autos.find_one(query)                    # only the first matching document
return db.autos.find(query, {"_id":0, "name":1})   # projection: return just the name field
_id is included by default, unless it is explicitly excluded like above.
db.autos.insert(auto)
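insert() also mutates the dictionary it is given: pymongo fills in the generated _id (the field values here are just an example):
auto = {"name": "Porsche 356", "manufacturer": "Porsche"}
db.autos.insert(auto)
print(auto["_id"])        # the ObjectId pymongo generated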
>mongoimport -d examplesdb -c autos --file autos.json
from datetime import datetime
query = {"foundingDate" : {"$gte":datetime(2001,1,1), "$lte" : datetime(2099,12,31)}}
query = {"governmentType":{"$exists" : 1}}                 # the field must exist
query = {"assembly":{"$in":["Germany","United Kingdom","Japan"]}, "manufacturer":"Ford Motor Company"}   # $in: any of the listed values
query = {"modelYears":{"$all":[1965, 1966, 1967]}}         # $all: the array contains every listed value
query = {"dimensions.width":{"$gt":2.5}}                   # dot notation reaches into embedded documents
# $set adds or updates a field
city = db.cities.update({"name":"Munich", "country":"Germany"},
                        {"$set":{"isoCountryCode":"DEU"}})
# $unset removes the field; the value given for it is ignored
city = db.cities.update({"name":"Munich", "country":"Germany"},
                        {"$unset":{"isoCountryCode":"blahblah_this is ignored"}})
# multi=True applies the update to every matching document, not just the first
city = db.cities.update({"name":"Munich", "country":"Germany"},
                        {"$set":{"isoCountryCode":"DEU"}},
                        multi=True)
# remove() without a query deletes every document in the collection
city = db.cities.remove()
A lot of specific data cleaning and modifying while copying the data from CSV into MongoDB, row by row. It is about an arachnid (spiders) data set; a rough sketch of the pattern follows.
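This is only the skeleton, with made-up file and collection names and a trivial cleaning step; the lesson itself does many field-specific fixes:
import csv
from pymongo import MongoClient

db = MongoClient('localhost:27017')['examples']

with open('arachnid.csv', 'rb') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # example cleaning step: strip whitespace and drop empty values
        doc = dict((k.strip(), v.strip()) for k, v in row.items() if v and v.strip())
        db.arachnid.insert(doc)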
db = get_db('twitter')        # helper from the course that returns the database handle
# number of tweets per source (client application), most frequent first
pipeline = [
    {"$group": {"_id": "$source",
                "count": {"$sum": 1}}},
    {"$sort": {'count': -1}}]
result = db.tweets.aggregate(pipeline)
# user in the Brasilia time zone with at least 100 tweets who has the most followers
pipeline = [{"$match":{"user.time_zone":"Brasilia"}},
            {"$match":{"user.statuses_count":{"$gte":100}}},
            {"$project":{"followers":"$user.followers_count",
                         "screen_name":"$user.screen_name",
                         "tweets":"$user.statuses_count"}},
            {"$sort":{"followers":-1}},
            {"$limit":1}]
# region ("isPartOf") that the most Indian cities belong to
pipeline = [{"$match":{"country":"India"}},
            {"$unwind":"$isPartOf"},
            {"$group":{"_id":"$isPartOf", "count":{"$sum":1}}},
            {"$sort":{"count":-1}},
            {"$limit":1}]
# average city population per region, then the average over those regional averages
pipeline = [{"$match":{"country":"India"}},
            {"$unwind":"$isPartOf"},
            {"$group":{"_id":"$isPartOf",
                       "avg":{"$avg":"$population"}}},
            {"$group":{"_id":"totalAvg",
                       "avg":{"$avg":"$avg"}}}]
# the five users with the most tweets, together with the texts of their tweets
pipeline = [{"$group":{"_id":"$user.screen_name",
                       "tweet_texts":{"$push":"$text"},
                       "count":{"$sum":1}}},
            {"$sort":{"count":-1}},
            {"$limit":5}]
db.autos.ensureIndex({"name": 1})             # single-field index, ascending
db.autos.ensureIndex(loc_field, direction)    # shorthand in my notes for an index on a location field
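With pymongo the equivalent is create_index; the geospatial index and the coordinates below are just my guess at what the location index was about:
import pymongo

db.autos.create_index([("name", pymongo.ASCENDING)])
# a 2d geospatial index plus a $near query (example coordinates)
db.cities.create_index([("loc", pymongo.GEO2D)])
db.cities.find({"loc": {"$near": [11.5, 48.1]}})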
Just building different pipelines.
It’s about the OpenStreetMap data set, which can be downloaded as XML from their site. You can download the part of the map you are looking at, or download data for major cities. They also have a very nice wiki.
The data is XML with “node”s and “way”s (a way is a street, road, etc.). The data is human-edited, so it contains errors.
import xml.etree.ElementTree as ET

for event, elem in ET.iterparse(xml_filename, events=("start",)):
    handle_node(elem)
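A typical first look at the file is to count how often each tag occurs; a small sketch (the file name is just an example):
import xml.etree.ElementTree as ET
from collections import defaultdict

def count_tags(xml_filename):
    # how many times each element tag (node, way, tag, nd, ...) appears
    counts = defaultdict(int)
    for _, elem in ET.iterparse(xml_filename):
        counts[elem.tag] += 1
        elem.clear()        # free memory, matters for large OSM extracts
    return dict(counts)

print(count_tags("munich.osm"))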
The non-iterative parsing (reading everything into memory at once) could look like this:
tree = ET.parse(xml_filename)
root = tree.getroot()
for child in root:
    handle_node(child)
import re

lower = re.compile(r'^([a-z]|_)*$')     # only lowercase letters and underscores
re.findall(lower, string)               # all non-overlapping matches as a list
m = lower.search(string)
if m:
    substring = m.group()               # the matched text
It’s about parsing an XML document, iterating over the XML nodes and building proper Python dictionaries out of them (the exact shape is specified in the task description). A rough sketch of the pattern is below.
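The field names here are hypothetical; the real ones come from the task description:
import xml.etree.ElementTree as ET

def shape_element(elem):
    # turn a node or way element into a dictionary
    doc = {"type": elem.tag, "id": elem.attrib.get("id")}
    for tag in elem.iter("tag"):        # every <tag k="..." v="..."/> child
        doc[tag.attrib["k"]] = tag.attrib["v"]
    return doc

def process_map(xml_filename):
    data = []
    for _, elem in ET.iterparse(xml_filename):
        if elem.tag in ("node", "way"):
            data.append(shape_element(elem))
            elem.clear()                # only clear elements we are done with
    return data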
Conclusions