## Regular Expressions

In [1]:
import re

In [2]:
# Sample inputs for the date-matching examples: s0 holds only a partial date,
# s1 starts with a full date, s2 ends with one, and s3 contains two.
s0 = 'No full dates here, just 02/15'
s1 = '02/14/2021 is a date'
s2 = 'Another date is 12/25/2020'
s3 = "April Fools' Day is 4/1/2021 and May the Fourth is 5/4/2021"

"April Fools' Day is 4/1/2021 and May the Fourth is 5/4/2021"

In [3]:
# re.match only tries the pattern at the start of the string; s1 begins with a date, so it matches
re.match(r'\d+/\d+/\d+',s1)

<re.Match object; span=(0, 10), match='02/14/2021'>

In [4]:
# s0 only contains a partial date (02/15), so nothing matches and the result is None (no output)
re.match(r'\d+/\d+/\d+',s0)

In [5]:
# A failed match returns None, which is falsy, so the result can select the message directly.
print("GOT A DATE" if re.match(r'\d+/\d+/\d+', s0) else "NO DATE")

NO DATE


In [6]:
# Partial date only (month/day, no year); not referenced by any later cell shown here
s4 = 'Another date is 12/25'

'Another date is 12/25'

In [7]:
# The date in s2 sits at the end of the string, not the start, so re.match finds nothing
re.match(r'\d+/\d+/\d+',s2)

In [8]:
# re.search scans the whole string, so it finds the date that re.match missed
re.search(r'\d+/\d+/\d+',s2)

<re.Match object; span=(16, 26), match='12/25/2020'>

In [9]:
# Display s3, which contains two dates
s3

"April Fools' Day is 4/1/2021 and May the Fourth is 5/4/2021"

In [10]:
# re.search returns only the first match, even though s3 contains two dates
match = re.search(r'\d+/\d+/\d+',s3)

<re.Match object; span=(20, 28), match='4/1/2021'>

In [11]:
# group(0) is the full text matched by the pattern
match.group(0)

'4/1/2021'

In [12]:
# re.findall returns every non-overlapping match as a list of strings
re.findall(r'\d+/\d+/\d+',s3)

['4/1/2021', '5/4/2021']

In [13]:
# Once '4/1/2021' has been consumed, the leftover '/234/234' holds only two
# numbers, so it cannot form another match; findall still returns two dates.
s5 = "April Fools' Day is 4/1/2021/234/234 and May the Fourth is 5/4/2021"
re.findall(r'\d+/\d+/\d+',s5)

['4/1/2021', '5/4/2021']

In [14]:
# finditer yields one Match object (including its span) per date found in s3.
date_matches = re.finditer(r'\d+/\d+/\d+', s3)
for match in date_matches:
    print(match)

<re.Match object; span=(20, 28), match='4/1/2021'>
<re.Match object; span=(51, 59), match='5/4/2021'>


In [15]:
# Each parenthesised part of the pattern is captured separately;
# groups() hands them back as a tuple of strings.
for match in re.finditer(r'(\d+)/(\d+)/(\d+)', s3):
    month, day, year = match.groups()
    print((month, day, year))

('4', '1', '2021')
('5', '4', '2021')


In [16]:
# Only the year is captured here, but group(0) is always the whole match,
# so the full date is printed; group(1) would give just the year.
for match in re.finditer(r'\d+/\d+/(\d+)',s3):
    print(match.group(0))

4/1/2021
5/4/2021


In [17]:
# Reformat each m/d/yyyy date as zero-padded yyyy-mm-dd, e.g. 2021-04-01.
for match in re.finditer(r'(\d+)/(\d+)/(\d+)', s3):
    month, day, year = (int(part) for part in match.groups())
    print('{2}-{0:02d}-{1:02d}'.format(month, day, year))

2021-04-01
2021-05-04


In [18]:
# re.sub replaces each entire match (the whole date, not only the year) with the literal text
re.sub(r'(\d+)/(\d+)/(\d+)',r'THE YEAR',s3)

"April Fools' Day is THE YEAR and May the Fourth is THE YEAR"

In [19]:
# \1, \2 and \3 refer back to the captured groups; the pieces are reordered but not zero-padded
re.sub(r'(\d+)/(\d+)/(\d+)',r'\3-\1-\2',s3)

"April Fools' Day is 2021-4-1 and May the Fourth is 2021-5-4"

In [20]:
def _to_iso(m):
    """Rebuild one m/d/yyyy match as a zero-padded yyyy-mm-dd string."""
    month, day, year = m.groups()
    return f'{year}-{int(month):02d}-{int(day):02d}'

# re.sub calls the function once per match and substitutes its return value.
re.sub(r'(\d+)/(\d+)/(\d+)', _to_iso, s3)

"April Fools' Day is 2021-04-01 and May the Fourth is 2021-05-04"

## Files

In [36]:
# NOTE(review): huck-finn-lines.txt is written by a later cell in this notebook,
# so on a fresh run the file may not exist yet when this cell executes.
!cat huck-finn-lines.txt

The Project Gutenberg EBook of Adventures of Huckleberry Finn, CompleteTitle: Adventures of Huckleberry Finn, Complete"Don't put your feet up there, Huckleberry;" and "Don't scrunch uplike that, Huckleberry--set up straight;" and pretty soon she wouldsay, "Don't gap and stretch like that, Huckleberry--why don't you try to

In [37]:
# Shell command: show the first 10 lines of the book
!head -10 huck-finn.txt

﻿

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
by Mark Twain (Samuel Clemens)

This eBook is for the use of anyone anywhere at no cost and with almost
no restrictions whatsoever. You may copy it, give it away or re-use
it under the terms of the Project Gutenberg License included with this
eBook or online at www.gutenberg.net



In [38]:
# Shell command: show the last 10 lines of the book
!tail -10 huck-finn.txt


Most people start at our Web site which has the main PG search facility:

http://www.gutenberg.net

This Web site includes information about Project Gutenberg-tm, including
how to make donations to the Project Gutenberg Literary Archive
Foundation, how to help produce our new eBooks, and how to subscribe to
our email newsletter to hear about new eBooks.



In [23]:
# Print every line of the book that mentions 'Huckleberry'.
# Each line still ends with '\n' and print() adds another one, which is why
# the results are separated by blank lines.
f = open('huck-finn.txt', 'r')
for line in f:
    if 'Huckleberry' in line:
        print(line)
f.close()  # a file opened without `with` must be closed explicitly

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete

Title: Adventures of Huckleberry Finn, Complete

"Don't put your feet up there, Huckleberry;" and "Don't scrunch up

like that, Huckleberry--set up straight;" and pretty soon she would

say, "Don't gap and stretch like that, Huckleberry--why don't you try to

and crossed me off. She says, "Take your hands away, Huckleberry; what

Huckleberry; we'll come down to the village on her."

End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,



In [24]:
# Same search as before, but strip() removes the trailing newline (and any
# surrounding whitespace) so print() no longer produces blank lines.
f = open('huck-finn.txt', 'r')
for line in f:
    if 'Huckleberry' in line:
        print(line.strip())
f.close()  # a file opened without `with` must be closed explicitly

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,


In [25]:
# The `with` block closes the file automatically when it exits.
with open('huck-finn.txt') as f:
    for line in f:
        if 'Huckleberry' not in line:
            continue  # skip lines that do not mention the name
        print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,


In [26]:
# Read the whole book into a list of lines. `with` closes the handle as soon
# as the read is done instead of leaving it open. No encoding is passed, so
# the platform default is used, exactly as before.
with open('huck-finn.txt') as f:
    all_lines = f.readlines()
len(all_lines)

12361

In [27]:
# First ten lines as raw strings; the first one starts with a '\ufeff' byte-order mark
all_lines[0:10]

['\ufeff\n',
 '\n',
 'The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\n',
 'by Mark Twain (Samuel Clemens)\n',
 '\n',
 'This eBook is for the use of anyone anywhere at no cost and with almost\n',
 'no restrictions whatsoever. You may copy it, give it away or re-use\n',
 'it under the terms of the Project Gutenberg License included with this\n',
 'eBook or online at www.gutenberg.net\n',
 '\n']

In [28]:
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
# `with` closes the handle as soon as the read is done.
with open('huck-finn.txt', encoding='utf-8-sig') as f:
    all_lines = f.readlines()
all_lines[0:10]

['\n',
 '\n',
 'The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\n',
 'by Mark Twain (Samuel Clemens)\n',
 '\n',
 'This eBook is for the use of anyone anywhere at no cost and with almost\n',
 'no restrictions whatsoever. You may copy it, give it away or re-use\n',
 'it under the terms of the Project Gutenberg License included with this\n',
 'eBook or online at www.gutenberg.net\n',
 '\n']

In [29]:
# read(100) returns at most the first 100 characters of the file.
# `with` closes the handle as soon as the read is done.
with open('huck-finn.txt', encoding='utf-8-sig') as f:
    initial_str = f.read(100)
initial_str  # bare name so the value is shown as the cell output

'\n\nThe Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\nby Mark Twain (Samuel Clem'

In [30]:
# print() renders the '\n' escapes in the string as real line breaks
print(initial_str)



The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
by Mark Twain (Samuel Clem


In [31]:
# A file object is a one-shot iterator: the first loop reads to end-of-file,
# so the second loop over the same handle has nothing left and prints nothing.
# `with` makes sure the handle is closed afterwards.
with open('huck-finn.txt', 'r') as f:
    for line in f:
        if 'Huckleberry' in line:
            print(line.strip())
    for line in f:  # already at end-of-file: this body never runs
        if "George" in line:
            print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,


In [32]:
# Opening the file a second time gives a fresh iterator positioned at the
# start, so the second search sees every line. Each handle is closed by its
# own `with` block before the next one is opened.
with open('huck-finn.txt', 'r') as f:
    for line in f:
        if 'Huckleberry' in line:
            print(line.strip())
with open('huck-finn.txt', 'r') as f:
    for line in f:
        if "George" in line:
            print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,
"George Peters, mum."
"Well, try to remember it, George.  Don't forget and tell me it's
Elexander before you go, and then get out by saying it's George
Sarah Mary Williams George Elexander Peters, and if you get into trouble
"My George!  It's the beatenest thing I ever struck.  And _then_ what
"George Jackson, sir."
"George Jackson, sir.  I'm only a boy."
Bob and Tom, some of you, and fetch the guns.  George Jackson, is there
"Now, George Jackson, do you know the Shepherdso

In [33]:
# Instead of reopening the file, seek(0) rewinds the same handle to the start
# so it can be iterated a second time. `with` closes the handle afterwards.
with open('huck-finn.txt', 'r') as f:
    for line in f:
        if 'Huckleberry' in line:
            print(line.strip())
    f.seek(0)  # back to the beginning of the file
    for line in f:
        if "George" in line:
            print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,
"George Peters, mum."
"Well, try to remember it, George.  Don't forget and tell me it's
Elexander before you go, and then get out by saying it's George
Sarah Mary Williams George Elexander Peters, and if you get into trouble
"My George!  It's the beatenest thing I ever struck.  And _then_ what
"George Jackson, sir."
"George Jackson, sir.  I'm only a boy."
Bob and Tom, some of you, and fetch the guns.  George Jackson, is there
"Now, George Jackson, do you know the Shepherdso

In [34]:
# Collect both sets of matching lines in a single pass over the file.
# A line that mentions both names is added to both lists.
with open('huck-finn.txt', 'r') as f:
    huckleberry = []
    george = []
    for line in f:
        if 'Huckleberry' in line:
            huckleberry.append(line.strip())
        if 'George' in line:
            george.append(line.strip())
huckleberry, george

(['The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete',
  'Title: Adventures of Huckleberry Finn, Complete',
  '"Don\'t put your feet up there, Huckleberry;" and "Don\'t scrunch up',
  'like that, Huckleberry--set up straight;" and pretty soon she would',
  'say, "Don\'t gap and stretch like that, Huckleberry--why don\'t you try to',
  'and crossed me off. She says, "Take your hands away, Huckleberry; what',
  'Huckleberry; we\'ll come down to the village on her."',
  'End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,'],
 ['"George Peters, mum."',
  '"Well, try to remember it, George. \xa0Don\'t forget and tell me it\'s',
  "Elexander before you go, and then get out by saying it's George",
  'Sarah Mary Williams George Elexander Peters, and if you get into trouble',
  '"My George! \xa0It\'s the beatenest thing I ever struck. \xa0And _then_ what',
  '"George Jackson, sir."',
  '"George Jackson, sir. \xa0I\'m only a boy."',
  'Bob and Tom, some of

In [39]:
import csv

# newline='' leaves line-ending handling to the csv module, as its
# documentation requires for files passed to csv.reader.
with open('persons_of_concern.csv', 'r', newline='') as f:
    for i in range(3): # skip first three lines
        next(f, None)  # default avoids StopIteration if the file is shorter
    reader = csv.reader(f)
    records = list(reader)  # each record is a list of strings; records[0] is the header row
records[:4]

[['Year',
  'Country / territory of asylum/residence',
  'Origin',
  'Refugees (incl. refugee-like situations)',
  'Asylum-seekers (pending cases)',
  'Returned refugees',
  'Internally displaced persons (IDPs)',
  'Returned IDPs',
  'Stateless persons',
  'Others of concern',
  'Total Population'],
 ['1951',
  'Australia',
  'Various/Unknown',
  '180000',
  '',
  '',
  '',
  '',
  '',
  '',
  '180000'],
 ['1951',
  'Austria',
  'Various/Unknown',
  '282000',
  '',
  '',
  '',
  '',
  '',
  '',
  '282000'],
 ['1951',
  'Belgium',
  'Various/Unknown',
  '55000',
  '',
  '',
  '',
  '',
  '',
  '',
  '55000']]

In [40]:
# Pair the header row with each of the first ten data rows to get one dict per row.
header = records[0]
for row_index in range(1, 11):
    print(dict(zip(header, records[row_index])))

{'Year': '1951', 'Country / territory of asylum/residence': 'Australia', 'Origin': 'Various/Unknown', 'Refugees (incl. refugee-like situations)': '180000', 'Asylum-seekers (pending cases)': '', 'Returned refugees': '', 'Internally displaced persons (IDPs)': '', 'Returned IDPs': '', 'Stateless persons': '', 'Others of concern': '', 'Total Population': '180000'}
{'Year': '1951', 'Country / territory of asylum/residence': 'Austria', 'Origin': 'Various/Unknown', 'Refugees (incl. refugee-like situations)': '282000', 'Asylum-seekers (pending cases)': '', 'Returned refugees': '', 'Internally displaced persons (IDPs)': '', 'Returned IDPs': '', 'Stateless persons': '', 'Others of concern': '', 'Total Population': '282000'}
{'Year': '1951', 'Country / territory of asylum/residence': 'Belgium', 'Origin': 'Various/Unknown', 'Refugees (incl. refugee-like situations)': '55000', 'Asylum-seekers (pending cases)': '', 'Returned refugees': '', 'Internally displaced persons (IDPs)': '', 'Returned IDPs': 

In [41]:
import csv

# newline='' leaves line-ending handling to the csv module, as its
# documentation requires for files passed to csv.DictReader.
with open('persons_of_concern.csv', 'r', newline='') as f:
    for i in range(3): # skip first three lines
        next(f, None)  # default avoids StopIteration if the file is shorter
    # DictReader takes the next line as the header and uses it for the keys.
    reader = csv.DictReader(f)
    records = list(reader)  # each record is a dict keyed by column name
records[:2]

[{'Year': '1951',
  'Country / territory of asylum/residence': 'Australia',
  'Origin': 'Various/Unknown',
  'Refugees (incl. refugee-like situations)': '180000',
  'Asylum-seekers (pending cases)': '',
  'Returned refugees': '',
  'Internally displaced persons (IDPs)': '',
  'Returned IDPs': '',
  'Stateless persons': '',
  'Others of concern': '',
  'Total Population': '180000'},
 {'Year': '1951',
  'Country / territory of asylum/residence': 'Austria',
  'Origin': 'Various/Unknown',
  'Refugees (incl. refugee-like situations)': '282000',
  'Asylum-seekers (pending cases)': '',
  'Returned refugees': '',
  'Internally displaced persons (IDPs)': '',
  'Returned IDPs': '',
  'Stateless persons': '',
  'Others of concern': '',
  'Total Population': '282000'}]

In [42]:
import json
# Write `records` to persons_of_concern.json as JSON; `with` closes the file afterwards
with open('persons_of_concern.json', 'w') as f:
    json.dump(records, f)

In [43]:
# Demonstration: the file is opened WITHOUT `with` and is deliberately never
# closed, and an exception is raised on purpose part-way through the loop.
# The writes are still sitting in the file object's buffer at that point.
outf = open('huck-finn-lines.txt','w')
for i, line in enumerate(huckleberry):
    print(line)
    # The entries in `huckleberry` were stripped, so add the line terminator
    # back; otherwise every entry would run together on one line.
    outf.write(line + '\n')
    if i > 3:
        raise Exception("Failure")  # intentional failure for the demonstration

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to


Exception: Failure

In [44]:
# file is empty! The exception fired before outf was closed, so the buffered
# writes had not been flushed to disk yet.
!cat huck-finn-lines.txt

In [45]:
# Same deliberate failure, but inside a `with` block: the file is flushed and
# closed when the block exits, even though it exits via an exception.
with open('huck-finn-lines.txt','w') as outf:
    for i, line in enumerate(huckleberry):
        # The entries in `huckleberry` were stripped, so add the line
        # terminator back; otherwise every entry would run together on one line.
        outf.write(line + '\n')
        if i > 3:
            raise Exception("Failure")  # intentional failure for the demonstration

Exception: Failure

In [46]:
# file has lines written until exception: leaving the `with` block closed
# (and flushed) the file even though an exception was raised
!cat huck-finn-lines.txt

The Project Gutenberg EBook of Adventures of Huckleberry Finn, CompleteTitle: Adventures of Huckleberry Finn, Complete"Don't put your feet up there, Huckleberry;" and "Don't scrunch uplike that, Huckleberry--set up straight;" and pretty soon she wouldsay, "Don't gap and stretch like that, Huckleberry--why don't you try to