## Regular Expressions

In [1]:
import csv
import re

In [2]:
# Sample strings for the regex examples:
#   s0 - no complete date, s1 - date at the start,
#   s2 - date at the end,  s3 - two dates in one string.
s0 = "No full dates here, just 02/15"
s1 = "02/14/2024 is a date"
s2 = "Another date is 12/25/2024"
s3 = "Halloween is 10/31/2024 and Thanksgiving is 11/28/2024"
# A trailing expression is required for the cell to display a value.
s3

'Halloween is 10/31/2024 and Thanksgiving is 11/28/2024'

In [3]:
# re.match only tries the pattern at the very start of the string;
# s1 begins with a date, so a Match object is returned.
re.match(r'\d+/\d+/\d+',s1)

<re.Match object; span=(0, 10), match='02/14/2024'>

In [4]:
# s0 only contains "02/15" (no third number), so the result is None
# and the cell displays nothing.
re.match(r'\d+/\d+/\d+',s0)

In [5]:
# A failed match is None (falsy) and a successful one is a truthy Match
# object, so the result can drive a conditional directly.
date_at_start = re.match(r'\d+/\d+/\d+',s0)
print("GOT A DATE" if date_at_start else "NO DATE")

NO DATE


In [6]:
# The date in s2 is not at the start of the string, so re.match finds
# nothing (None, no output).
re.match(r'\d+/\d+/\d+',s2)

In [7]:
# re.search scans the whole string and returns the first match wherever it occurs.
re.search(r'\d+/\d+/\d+',s2)

<re.Match object; span=(16, 26), match='12/25/2024'>

In [8]:
# s3 contains two dates, but re.search still returns only the first one.
re.search(r'\d+/\d+/\d+',s3)

<re.Match object; span=(13, 23), match='10/31/2024'>

In [9]:
# re.findall returns every non-overlapping match as a list of strings.
re.findall(r'\d+/\d+/\d+',s3)

['10/31/2024', '11/28/2024']

In [10]:
# finditer yields one Match object per date, in left-to-right order.
date_pattern = re.compile(r'\d+/\d+/\d+')
for m in date_pattern.finditer(s3):
    print(m)

<re.Match object; span=(13, 23), match='10/31/2024'>
<re.Match object; span=(44, 54), match='11/28/2024'>


### Groups in Regex

In [11]:
# Keep the first match in a variable so its groups can be inspected in the
# following cells.
match = re.search(r'\d+/\d+/\d+',s3)
# An assignment alone displays nothing; end with the name to show the Match.
match

<re.Match object; span=(13, 23), match='10/31/2024'>

In [12]:
# group(0) is the entire text matched by the pattern.
match.group(0)

'10/31/2024'

In [13]:
# Same text as s3: both dates are well-formed, so both are found.
s5 = "Halloween is 10/31/2024 and Thanksgiving is 11/28/2024"
date_pattern = re.compile(r'\d+/\d+/\d+')
date_pattern.findall(s5)

['10/31/2024', '11/28/2024']

In [14]:
# group() with no argument is the same as group(0): the whole matched text.
for found in re.finditer(r'\d+/\d+/\d+',s3):
    print(found.group())

10/31/2024
11/28/2024


In [15]:
# The first date is followed by "/11/28"; matches cannot overlap, and the
# leftover "/11/28" is not a full date on its own, so only two dates are found.
s5 = "Halloween is 10/31/2024/11/28 and Thanksgiving is 11/28/2024"
[m.group() for m in re.finditer(r'\d+/\d+/\d+', s5)]

['10/31/2024', '11/28/2024']

In [16]:
# Parentheses capture parts of the match; groups() returns them as a tuple.
all_groups = (m.groups() for m in re.finditer(r'(\d+)/(\d+)/(\d+)',s3))
for month_day_year in all_groups:
    print(month_day_year)

('10', '31', '2024')
('11', '28', '2024')


In [17]:
# Only the year is parenthesised, so it is the single captured group; with
# exactly one group, findall returns that group's text for each match.
for year in re.findall(r'\d+/\d+/(\d+)',s3):
    print(year)

2024
2024


In [18]:
# Unpack the three captured groups and show only the month and the year.
for m in re.finditer(r'(\d+)/(\d+)/(\d+)',s3):
    month, _day, year = m.groups()
    print(month, year)

10 2024
11 2024


In [19]:
# Convert month/day/year dates to year-month-day, zero-padding month and day.
s21 = "Date 3/1/2024 and 10/31/2024"
for m in re.finditer(r'(\d+)/(\d+)/(\d+)',s21):
    month, day, year = map(int, m.groups())
    print(f'{year}-{month:02d}-{day:02d}')

2024-03-01
2024-10-31


In [20]:
# Show s3 again; the substitution examples that follow operate on it.
s3

'Halloween is 10/31/2024 and Thanksgiving is 11/28/2024'

In [21]:
# re.sub returns a new string with every match replaced by the given text.
re.sub(r'(\d+)/(\d+)/(\d+)',r'a holiday',s3)

'Halloween is a holiday and Thanksgiving is a holiday'

In [22]:
# \1, \2 and \3 in the replacement refer to the captured groups, so
# month/day/year is rewritten as year-month-day.
re.sub(r'(\d+)/(\d+)/(\d+)',r'\3-\1-\2',s3)

'Halloween is 2024-10-31 and Thanksgiving is 2024-11-28'

In [23]:
# Back-references copy the captured text verbatim, so single-digit months and
# days are not zero-padded.
s6 = "New Years Day was 1/1/2024"
re.compile(r'(\d+)/(\d+)/(\d+)').sub(r'\3-\1-\2', s6)

'New Years Day was 2024-1-1'

In [24]:
# The replacement can be a function (here a lambda) that receives each Match
# and returns the replacement text. Two ways of zero-padding are shown:
# ':>02' pads the month string directly, while the day is converted with
# int() and formatted with ':02d'.
re.sub(r'(\d+)/(\d+)/(\d+)',lambda m: 
       f'{m.group(3)}-{m.group(1):>02}-{int(m.group(2)):02d}',s6)

'New Years Day was 2024-01-01'

In [25]:
def to_iso_date(m):
    """Return a matched month/day/year date as year-month-day with zero-padded month and day."""
    month, day, year = m.groups()
    return f'{year}-{int(month):02d}-{int(day):02d}'

# re.sub calls to_iso_date once per match and inserts its return value.
re.sub(r'(\d+)/(\d+)/(\d+)', to_iso_date, s3)

'Halloween is 2024-10-31 and Thanksgiving is 2024-11-28'

## Files

In [26]:
# output: the value of the cell's last expression is displayed as its result
"hi" + " there"

'hi there'

In [27]:
# stderr: a print statement without parentheses is not valid Python 3, so this
# cell fails with a SyntaxError (presumably a deliberate demonstration of
# error output)
print "hi"

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(...)? (1580183791.py, line 2)

In [28]:
# print() called as a function is the valid Python 3 form; the text appears
# as the cell's printed output.
print("hi")

hi


In [29]:
# print the whole file
# !cat huck-finn.txt

In [30]:
# Show the first 10 lines of the file (explicit -n form, as used for the CSV below).
!head -n 10 huck-finn.txt

﻿

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
by Mark Twain (Samuel Clemens)

This eBook is for the use of anyone anywhere at no cost and with almost
no restrictions whatsoever. You may copy it, give it away or re-use
it under the terms of the Project Gutenberg License included with this
eBook or online at www.gutenberg.net



In [31]:
# Show the last 10 lines of the file (explicit -n form).
!tail -n 10 huck-finn.txt


Most people start at our Web site which has the main PG search facility:

http://www.gutenberg.net

This Web site includes information about Project Gutenberg-tm, including
how to make donations to the Project Gutenberg Literary Archive
Foundation, how to help produce our new eBooks, and how to subscribe to
our email newsletter to hear about new eBooks.



In [32]:
# Iterating over a file object yields one line at a time, newline included,
# so print() (which adds its own newline) leaves a blank line after each hit.
f = open('huck-finn.txt', 'r')
for line in f:
    if 'Huckleberry' not in line:
        continue
    print(line)

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete

Title: Adventures of Huckleberry Finn, Complete

"Don't put your feet up there, Huckleberry;" and "Don't scrunch up

like that, Huckleberry--set up straight;" and pretty soon she would

say, "Don't gap and stretch like that, Huckleberry--why don't you try to

and crossed me off. She says, "Take your hands away, Huckleberry; what

Huckleberry; we'll come down to the village on her."

End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,



In [33]:
# Same search, but strip() removes the trailing newline (and surrounding
# whitespace) so the output is not double-spaced.
f = open('huck-finn.txt')
matching = (line for line in f if 'Huckleberry' in line)
for line in matching:
    print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,


In [34]:
# Print only lines 0 through 20 and stop reading as soon as line 21 is reached.
f = open('huck-finn.txt')
for i, line in enumerate(f):
    if i <= 20:
        print(line.strip())
    else:
        break

﻿

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
by Mark Twain (Samuel Clemens)

This eBook is for the use of anyone anywhere at no cost and with almost
no restrictions whatsoever. You may copy it, give it away or re-use
it under the terms of the Project Gutenberg License included with this
eBook or online at www.gutenberg.net

Title: Adventures of Huckleberry Finn, Complete

Author: Mark Twain (Samuel Clemens)

Release Date: August 20, 2006 [EBook #76]

Last Updated: April 18, 2015]

Language: English




In [35]:
# Explicitly release the file handle opened in the previous cell.
f.close()

In [36]:
# Reading from a closed file is an error: this raises
# ValueError: I/O operation on closed file.
next(iter(f))

ValueError: I/O operation on closed file.

In [37]:
# The with statement closes the file automatically when the block ends.
with open('huck-finn.txt') as f:
    for line in filter(lambda text: 'Huckleberry' in text, f):
        print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,


In [38]:
# Read the whole file into a list of lines. The with block closes the file as
# soon as the lines have been read instead of leaving the handle open.
with open('huck-finn.txt') as f:
    all_lines = f.readlines()
len(all_lines)

12361

In [39]:
# First ten lines; note the '\ufeff' byte-order mark at the start of the first one.
all_lines[:10]

['\ufeff\n',
 '\n',
 'The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\n',
 'by Mark Twain (Samuel Clemens)\n',
 '\n',
 'This eBook is for the use of anyone anywhere at no cost and with almost\n',
 'no restrictions whatsoever. You may copy it, give it away or re-use\n',
 'it under the terms of the Project Gutenberg License included with this\n',
 'eBook or online at www.gutenberg.net\n',
 '\n']

In [40]:
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark, so the first
# line no longer starts with '\ufeff'. The with block closes the file.
with open('huck-finn.txt', encoding='utf-8-sig') as f:
    all_lines = f.readlines()
all_lines[:10]

['\n',
 '\n',
 'The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\n',
 'by Mark Twain (Samuel Clemens)\n',
 '\n',
 'This eBook is for the use of anyone anywhere at no cost and with almost\n',
 'no restrictions whatsoever. You may copy it, give it away or re-use\n',
 'it under the terms of the Project Gutenberg License included with this\n',
 'eBook or online at www.gutenberg.net\n',
 '\n']

In [41]:
# read(100) returns at most the first 100 characters; with 'utf-8-sig' the
# byte-order mark is skipped. The with block closes the file afterwards.
with open('huck-finn.txt', encoding='utf-8-sig') as f:
    initial_str = f.read(100)
# An assignment alone displays nothing; end with the name to show the string.
initial_str

'\n\nThe Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\nby Mark Twain (Samuel Clem'

In [42]:
# With plain 'utf-8' the byte-order mark is kept as the first character
# ('\ufeff') and counts towards the 100 characters read.
with open('huck-finn.txt', encoding='utf-8') as f:
    initial_str = f.read(100)
# An assignment alone displays nothing; end with the name to show the string.
initial_str

'\ufeff\n\nThe Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\nby Mark Twain (Samuel Cle'

In [43]:
# print() shows the string as text, so the \n escapes become real line breaks.
print(initial_str)

﻿

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
by Mark Twain (Samuel Cle


In [44]:
# A file object can only be iterated once: the first loop reads to the end of
# the file, so the second loop has nothing left and prints no "George" lines.
# The with block makes sure the file is closed when the cell finishes.
with open('huck-finn.txt', 'r') as f:
    for line in f:
        if 'Huckleberry' in line:
            print(line.strip())
    for line in f:
        if "George" in line:
            print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,


In [45]:
# Fix 1: open the file a second time for the second pass. Each with block
# closes its own handle, so neither one is left open.
with open('huck-finn.txt', 'r') as f:
    for line in f:
        if 'Huckleberry' in line:
            print(line.strip())
with open('huck-finn.txt', 'r') as f:
    for line in f:
        if "George" in line:
            print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,
"George Peters, mum."
"Well, try to remember it, George.  Don't forget and tell me it's
Elexander before you go, and then get out by saying it's George
Sarah Mary Williams George Elexander Peters, and if you get into trouble
"My George!  It's the beatenest thing I ever struck.  And _then_ what
"George Jackson, sir."
"George Jackson, sir.  I'm only a boy."
Bob and Tom, some of you, and fetch the guns.  George Jackson, is there
"Now, George Jackson, do you know the Shepherdso

In [46]:
# Fix 2: keep the same handle and rewind it to the beginning with seek(0)
# before the second pass. The with block closes the file at the end.
with open('huck-finn.txt', 'r') as f:
    for line in f:
        if 'Huckleberry' in line:
            print(line.strip())
    f.seek(0)
    for line in f:
        if "George" in line:
            print(line.strip())

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
and crossed me off. She says, "Take your hands away, Huckleberry; what
Huckleberry; we'll come down to the village on her."
End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,
"George Peters, mum."
"Well, try to remember it, George.  Don't forget and tell me it's
Elexander before you go, and then get out by saying it's George
Sarah Mary Williams George Elexander Peters, and if you get into trouble
"My George!  It's the beatenest thing I ever struck.  And _then_ what
"George Jackson, sir."
"George Jackson, sir.  I'm only a boy."
Bob and Tom, some of you, and fetch the guns.  George Jackson, is there
"Now, George Jackson, do you know the Shepherdso

In [47]:
# Fix 3: make a single pass over the file and sort matching lines into two
# lists as we go. A line containing both words is added to both lists.
huckleberry = []
george = []
with open('huck-finn.txt', 'r') as f:  # closed automatically after the loop
    for line in f:
        if 'Huckleberry' in line:
            huckleberry.append(line.strip())
        if 'George' in line:
            george.append(line.strip())
huckleberry, george

(['The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete',
  'Title: Adventures of Huckleberry Finn, Complete',
  '"Don\'t put your feet up there, Huckleberry;" and "Don\'t scrunch up',
  'like that, Huckleberry--set up straight;" and pretty soon she would',
  'say, "Don\'t gap and stretch like that, Huckleberry--why don\'t you try to',
  'and crossed me off. She says, "Take your hands away, Huckleberry; what',
  'Huckleberry; we\'ll come down to the village on her."',
  'End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,'],
 ['"George Peters, mum."',
  '"Well, try to remember it, George. \xa0Don\'t forget and tell me it\'s',
  "Elexander before you go, and then get out by saying it's George",
  'Sarah Mary Williams George Elexander Peters, and if you get into trouble',
  '"My George! \xa0It\'s the beatenest thing I ever struck. \xa0And _then_ what',
  '"George Jackson, sir."',
  '"George Jackson, sir. \xa0I\'m only a boy."',
  'Bob and Tom, some of

In [48]:
# Peek at the first 10 lines: the file starts with two metadata lines and a
# blank line before the CSV header row.
!head -n 10 persons_of_concern.csv

"Extracted from the UNHCR Population Statistics Reference Database","United Nations High Commissioner for Refugees"
"Date extracted: 2015-09-18 04:36:55 +02:00"

Year,"Country / territory of asylum/residence",Origin,"Refugees (incl. refugee-like situations)","Asylum-seekers (pending cases)","Returned refugees","Internally displaced persons (IDPs)","Returned IDPs","Stateless persons","Others of concern","Total Population"
1951,Australia,Various/Unknown,180000,,,,,,,180000
1951,Austria,Various/Unknown,282000,,,,,,,282000
1951,Belgium,Various/Unknown,55000,,,,,,,55000
1951,Canada,Various/Unknown,168511,,,,,,,168511
1951,Switzerland,Various/Unknown,10000,,,,,,,10000
1951,Germany,Various/Unknown,265000,,,,,,,265000


In [49]:
# Load the CSV as a list of rows (each row is a list of strings; the first
# row is the header).
# newline='' hands line-ending handling to the csv module, as its
# documentation recommends.
with open('persons_of_concern.csv', 'r', newline='') as f:
    for i in range(3): # skip first three lines
        next(f)
    reader = csv.reader(f)
    records = list(reader) # each record is a list
records[:4]

[['Year',
  'Country / territory of asylum/residence',
  'Origin',
  'Refugees (incl. refugee-like situations)',
  'Asylum-seekers (pending cases)',
  'Returned refugees',
  'Internally displaced persons (IDPs)',
  'Returned IDPs',
  'Stateless persons',
  'Others of concern',
  'Total Population'],
 ['1951',
  'Australia',
  'Various/Unknown',
  '180000',
  '',
  '',
  '',
  '',
  '',
  '',
  '180000'],
 ['1951',
  'Austria',
  'Various/Unknown',
  '282000',
  '',
  '',
  '',
  '',
  '',
  '',
  '282000'],
 ['1951',
  'Belgium',
  'Various/Unknown',
  '55000',
  '',
  '',
  '',
  '',
  '',
  '',
  '55000']]

In [50]:
# Pair the header row with each of the first ten data rows to build
# column-name -> value dictionaries by hand.
header = records[0]
for row_number in range(1, 11):
    print(dict(zip(header, records[row_number])))

{'Year': '1951', 'Country / territory of asylum/residence': 'Australia', 'Origin': 'Various/Unknown', 'Refugees (incl. refugee-like situations)': '180000', 'Asylum-seekers (pending cases)': '', 'Returned refugees': '', 'Internally displaced persons (IDPs)': '', 'Returned IDPs': '', 'Stateless persons': '', 'Others of concern': '', 'Total Population': '180000'}
{'Year': '1951', 'Country / territory of asylum/residence': 'Austria', 'Origin': 'Various/Unknown', 'Refugees (incl. refugee-like situations)': '282000', 'Asylum-seekers (pending cases)': '', 'Returned refugees': '', 'Internally displaced persons (IDPs)': '', 'Returned IDPs': '', 'Stateless persons': '', 'Others of concern': '', 'Total Population': '282000'}
{'Year': '1951', 'Country / territory of asylum/residence': 'Belgium', 'Origin': 'Various/Unknown', 'Refugees (incl. refugee-like situations)': '55000', 'Asylum-seekers (pending cases)': '', 'Returned refugees': '', 'Internally displaced persons (IDPs)': '', 'Returned IDPs': 

In [51]:
# csv.DictReader uses the first row it reads as the header and yields each
# following row as a dict keyed by column name.
# newline='' hands line-ending handling to the csv module, as its
# documentation recommends.
with open('persons_of_concern.csv', 'r', newline='') as f:
    for i in range(3): # skip first three lines
        next(f)
    reader = csv.DictReader(f)
    records = list(reader) # each record is a dict
records[:2]

[{'Year': '1951',
  'Country / territory of asylum/residence': 'Australia',
  'Origin': 'Various/Unknown',
  'Refugees (incl. refugee-like situations)': '180000',
  'Asylum-seekers (pending cases)': '',
  'Returned refugees': '',
  'Internally displaced persons (IDPs)': '',
  'Returned IDPs': '',
  'Stateless persons': '',
  'Others of concern': '',
  'Total Population': '180000'},
 {'Year': '1951',
  'Country / territory of asylum/residence': 'Austria',
  'Origin': 'Various/Unknown',
  'Refugees (incl. refugee-like situations)': '282000',
  'Asylum-seekers (pending cases)': '',
  'Returned refugees': '',
  'Internally displaced persons (IDPs)': '',
  'Returned IDPs': '',
  'Stateless persons': '',
  'Others of concern': '',
  'Total Population': '282000'}]

## Writing Files

In [52]:
# Collect the matching lines again, this time keeping each line's trailing
# newline so they can be written back out unchanged.
huckleberry = []
george = []
with open('huck-finn.txt', 'r') as f:  # closed automatically after the loop
    for line in f:
        if 'Huckleberry' in line:
            huckleberry.append(line)
        if 'George' in line:
            george.append(line)
huckleberry, george

(['The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\n',
  'Title: Adventures of Huckleberry Finn, Complete\n',
  '"Don\'t put your feet up there, Huckleberry;" and "Don\'t scrunch up\n',
  'like that, Huckleberry--set up straight;" and pretty soon she would\n',
  'say, "Don\'t gap and stretch like that, Huckleberry--why don\'t you try to\n',
  'and crossed me off. She says, "Take your hands away, Huckleberry; what\n',
  'Huckleberry; we\'ll come down to the village on her."\n',
  'End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,\n'],
 ['"George Peters, mum."\n',
  '"Well, try to remember it, George. \xa0Don\'t forget and tell me it\'s\n',
  "Elexander before you go, and then get out by saying it's George\n",
  'Sarah Mary Williams George Elexander Peters, and if you get into trouble\n',
  '"My George! \xa0It\'s the beatenest thing I ever struck. \xa0And _then_ what\n',
  '"George Jackson, sir."\n',
  '"George Jackson, sir. \xa0I\'m only a boy

In [53]:
# Each collected line still ends with its newline character (no strip this time).
huckleberry

['The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete\n',
 'Title: Adventures of Huckleberry Finn, Complete\n',
 '"Don\'t put your feet up there, Huckleberry;" and "Don\'t scrunch up\n',
 'like that, Huckleberry--set up straight;" and pretty soon she would\n',
 'say, "Don\'t gap and stretch like that, Huckleberry--why don\'t you try to\n',
 'and crossed me off. She says, "Take your hands away, Huckleberry; what\n',
 'Huckleberry; we\'ll come down to the village on her."\n',
 'End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,\n']

In [54]:
# Write the collected lines to a new file, but simulate a crash partway through.
outf = open('huck-finn-lines.txt','w')
for i, line in enumerate(huckleberry):
    print(line)
    outf.write(line)
    if i > 3:
        # raised after the fifth line (i == 4) has been printed and written
        raise Exception("Failure")
# outf.close()
# NOTE: the file is never closed here (the close above is commented out and
# would not be reached after the exception anyway). The next cell shows the
# file is still empty, presumably because the written text was only buffered.

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete

Title: Adventures of Huckleberry Finn, Complete

"Don't put your feet up there, Huckleberry;" and "Don't scrunch up

like that, Huckleberry--set up straight;" and pretty soon she would

say, "Don't gap and stretch like that, Huckleberry--why don't you try to



Exception: Failure

In [55]:
# file is empty! (the previous cell never closed or flushed outf)
!cat huck-finn-lines.txt

In [56]:
# Same simulated crash, but inside a with block: the file is closed (and its
# contents written out) even though an exception escapes the block.
with open('huck-finn-lines.txt','w') as outf:
    for position, text in enumerate(huckleberry, start=1):
        print(text)
        outf.write(text)
        if position >= 5:
            raise Exception("Failure")

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete

Title: Adventures of Huckleberry Finn, Complete

"Don't put your feet up there, Huckleberry;" and "Don't scrunch up

like that, Huckleberry--set up straight;" and pretty soon she would

say, "Don't gap and stretch like that, Huckleberry--why don't you try to



Exception: Failure

In [57]:
# file is not empty, first five lines are there!
# (the with block closed the file despite the exception)
!cat huck-finn-lines.txt

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to


In [58]:
# print(..., file=outf) writes to the file instead of the screen and appends
# the newline itself, so each line is stripped first.
with open('huck-finn-lines.txt','w') as outf:
    for i, line in enumerate(huckleberry):
        stripped = line.strip()
        print(stripped, file=outf) # lines with new line
        if i >= 4:
            raise Exception("Failure")

Exception: Failure

In [59]:
# file is not empty, first five lines are there!
# (written with print(..., file=outf) inside a with block this time)
!cat huck-finn-lines.txt

The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
Title: Adventures of Huckleberry Finn, Complete
"Don't put your feet up there, Huckleberry;" and "Don't scrunch up
like that, Huckleberry--set up straight;" and pretty soon she would
say, "Don't gap and stretch like that, Huckleberry--why don't you try to
