正则表达式是一种强大的文本处理工具,它可以帮助我们在处理大量文本数据时快速、准确地完成各种任务。在自然语言处理(NLP)领域,正则表达式的应用非常广泛,包括但不限于验证字符串结构、提取子字符串、搜索替换以及分割字符串等。本文将详细介绍正则表达式在NLP中的一些常见应用,并提供相应的Python代码示例。
正则表达式可以用于多种文本处理任务,以下是一些常见的应用场景:验证字符串结构、提取子字符串、搜索与替换、分割字符串。
这些任务在处理文本数据或解决NLP问题时非常常见。下面,我们将通过一些具体的代码示例,来展示正则表达式在NLP任务中的应用。
在Python中,正则表达式相关的操作主要通过re模块实现(使用前需要先执行 import re)。以下是一些常用的正则表达式函数:
- re.findall:搜索所有匹配给定模式的子串
- re.sub:替换匹配正则表达式的文本
- re.match:从字符串的开始位置匹配正则表达式模式
- re.search:在字符串中搜索正则表达式模式

在本文中,我们将重点使用re.findall函数来检测模式。
以下是一些使用正则表达式在NLP任务中的代码示例:
def find_url(string):
    """Return every http/https URL found in *string*.

    Args:
        string: Text to scan.

    Returns:
        The matched URLs joined by single spaces, or an empty string when
        nothing matches.
    """
    urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        string,
    )
    # Join with a space: an empty separator would fuse several URLs into
    # one unusable string.
    return " ".join(urls)


example = "I love spending time at https://www.leetcode.com/"
find_url(example)
输出:'https://www.leetcode.com/'
def findEmoji(text):
    """Return the names of the emoji contained in *text*.

    The text is first passed through ``emoji.demojize`` (third-party
    ``emoji`` package), which is expected to rewrite each emoji as
    ``:name:``. Every substring enclosed between two colons in that result
    is then collected.

    Args:
        text: Text that may contain emoji characters.

    Returns:
        A list with the text found between each pair of colons.
    """
    emo_text = emoji.demojize(text)
    return re.findall(r':(.*?):', emo_text)


example = "I love ⚽ very much 😁"
findEmoji(example)
输出:['soccer_ball', 'beaming_face_with_smiling_eyes']
def findEmail(text):
    """Return the e-mail-like tokens found in *text*.

    A token is a run of word characters, dots or hyphens, followed by ``@``
    and another such run.

    Args:
        text: Value to scan; it is converted with ``str()`` first.

    Returns:
        The matches joined by commas, or an empty string when none is found.
    """
    # The character classes need ``\w``; a bare ``w`` would only match the
    # letter "w", a dot or a hyphen.
    matches = re.findall(r'[\w.-]+@[\w.-]+', str(text))
    return ",".join(matches)


# NOTE(review): the address in this sample appears to have been redacted to
# "[email protected]"; it contains no "@", so restore a real address to get a
# non-empty result.
example = "Gaurav's gmail is [email protected]"
findEmail(example)
输出:'[email protected]'
def findHash(text):
    """Return the hashtags in *text* without their leading ``#``.

    Args:
        text: Text to scan.

    Returns:
        The word characters that directly follow each ``#``, joined by
        single spaces.
    """
    # The look-behind keeps the "#" itself out of the result.
    tags = re.findall(r'(?<=#)\w+', text)
    return " ".join(tags)


example = "#Sushant is trending now in the world"
findHash(example)
输出:'Sushant'
def findAt(text):
    """Return the @-mentions in *text* without their leading ``@``.

    Args:
        text: Text to scan.

    Returns:
        The word characters that directly follow each ``@``, joined by
        single spaces.
    """
    # The look-behind keeps the "@" itself out of the result.
    mentions = re.findall(r'(?<=@)\w+', text)
    return " ".join(mentions)


example = "@Ajit, please help me"
findAt(example)
输出:'Ajit'
def findNumber(text):
    """Return every run of ASCII digits in *text*.

    Args:
        text: Text to scan.

    Returns:
        The digit runs joined by single spaces.
    """
    numbers = re.findall(r'[0-9]+', text)
    return " ".join(numbers)


example = "8853147 sq. km of area washed away in floods"
findNumber(example)
输出:'8853147'
def findPhoneNumber(text):
    """Return the 10-digit numbers found in *text*.

    Args:
        text: Text to scan.

    Returns:
        The matches joined by single spaces, or an empty string when none
        is found.
    """
    # The look-arounds require exactly ten digits, so a longer digit run
    # is not truncated to its first ten digits.
    numbers = re.findall(r"(?<!\d)\d{10}(?!\d)", text)
    # A space separator keeps several numbers distinguishable.
    return " ".join(numbers)


findPhoneNumber("9990001796 is a phone number of PMO office")
输出:'9990001796'
def findNonalp(text):
    """Return every character of *text* that is not an ASCII letter, digit or space.

    Args:
        text: Text to scan.

    Returns:
        A list of the matching characters, in order of appearance.
    """
    return re.findall(r"[^A-Za-z0-9 ]", text)


example = "Twitter has lots of @ and # in posts.(2021 year is not good)"
findNonalp(example)
输出:['@', '#', '.', '(', ')']
def findYear(text):
    """Return the standalone 4-digit numbers in *text*.

    Args:
        text: Text to scan.

    Returns:
        A list of 4-digit strings that are not part of a longer digit run.
    """
    # Without the look-arounds, "9990001796" would yield "9990" and "0017".
    return re.findall(r"(?<!\d)\d{4}(?!\d)", text)


example = "My DOB year is 1998."
findYear(example)
输出:['1998']
def find_punct(text):
    """Return the punctuation characters of *text*, in order of appearance.

    The recognised set is ``! " $ % & ' ( ) * + , - . / : ; = # @ ? [ ] ^ _
    ` { | } ~``; ``<``, ``>`` and the backslash are not part of it.

    Args:
        text: Text to scan.

    Returns:
        A list with one entry per punctuation character found.
    """
    # Triple quotes let the class contain both quote characters; "-", "["
    # and "]" are escaped so they are taken literally.
    return re.findall(r"""[!"$%&'()*+,\-./:;=#@?\[\]^_`{|}~]""", text)


example = "Corona virus killed #24506 people. #Corona is un(tolerable)"
print(find_punct(example))
输出:['#', '.', '#', '(', ')']
def rep(text):
    """Replacement callback for ``re.sub``: keep only the first matched character.

    Args:
        text: The match object handed over by ``re.sub``.

    Returns:
        The first character of the whole match (the match itself when it
        is a single character or empty).
    """
    # Always return a string; falling through with None would make
    # re.sub drop a one-character match entirely.
    return text.group(0)[:1]


def unique_char(rep, sentence):
    """Collapse runs of a repeated word character in *sentence*.

    Args:
        rep: Callback that receives each match of a repeated character and
            returns its replacement.
        sentence: Text to clean.

    Returns:
        The sentence with every run replaced by the callback's result.
    """
    # (\w)\1+ matches a word character followed by one or more copies of it.
    return re.sub(r'(\w)\1+', rep, sentence)


example = "heyyy this is a verrrry loong texttt"
unique_char(rep, example)
输出:'hey this is a very long text'
def num_great(text):
    """Return the numbers in *text* that are 930 or greater.

    Args:
        text: Text to scan.

    Returns:
        The matching numbers joined by single spaces.
    """
    # The 4+-digit alternative must come first: alternation takes the first
    # branch that succeeds, so with the 3-digit branch leading, "9351"
    # would be cut down to "935".
    numbers = re.findall(r'[1-9]\d{3,}|9[3-9][0-9]', text)
    return " ".join(numbers)


example = "Height of this bridge is 935m. Width of this bridge is 30 metre. It used 9274kg of steel."
num_great(example)
输出:'935 9274'
def num_less(text):
    """Return the whitespace-separated tokens of *text* that are numbers from 0 to 929.

    Only tokens consisting entirely of such a number count; a token with
    attached punctuation (e.g. ``"920."``) is ignored.

    Args:
        text: Text to scan.

    Returns:
        The matching tokens joined by single spaces.
    """
    pattern = r'9[0-2][0-9]|[1-8][0-9]{2}|[1-9][0-9]|[0-9]'
    return " ".join(tok for tok in text.split() if re.fullmatch(pattern, tok))


example = "There are some countries where less than 920 cases exist with 1100 observations"
num_less(example)
输出:'920'
def findDates(text):
    """Return the dates written as mm/dd/yyyy in *text*.

    The month must be 1-12 and the day 1-31 (leading zero optional). Day
    and month are not cross-checked, so e.g. 02/31/2020 is still accepted.

    Args:
        text: Text to scan.

    Returns:
        A list of the matching date strings.
    """
    # The look-arounds stop a match from starting or ending inside a longer
    # digit/slash sequence (e.g. the "1/09/2020" tail of "31/09/2020").
    return re.findall(
        r'(?<![\d/])(?:0?[1-9]|1[0-2])/(?:0?[1-9]|[12]\d|3[01])/\d{4}(?!\d)',
        text,
    )


example = "Today's date is 06/21/2021 for format mm/dd/yyyy, not 31/09/2020"
findDates(example)
输出:['06/21/2021']
def onlyWords(text):
    """Return the words of *text* that contain no digits.

    Args:
        text: Text to scan.

    Returns:
        The digit-free words joined by single spaces; punctuation is dropped.
    """
    # [^\W\d] is "a word character that is not a digit"; plain \w would
    # also return numbers such as "100".
    words = re.findall(r'\b[^\W\d]+\b', text)
    return " ".join(words)


example = "Harish reduced his weight from 100 Kg to 75 kg."
onlyWords(example)
输出:'Harish reduced his weight from Kg to kg'
def only_numbers(text):
    """Return every run of digits in *text*.

    Args:
        text: Text to scan.

    Returns:
        The digit runs joined by single spaces.
    """
    numbers = re.findall(r'\d+', text)
    return " ".join(numbers)


example = "Harish reduced his weight from 100 Kg to 75 kg."
only_numbers(example)
输出:'100 75'
def pick_only_key_sentence(text, keyword):
    """Return the sentences of *text* that contain *keyword*.

    Sentences are taken to be the stretches of text between full stops.

    Args:
        text: Text to scan.
        keyword: Literal string to look for (not a regular expression).

    Returns:
        A list of the matching sentences with surrounding whitespace
        removed.
    """
    # Escape the keyword so characters such as "+" or "(" are matched
    # literally instead of being read as regex syntax.
    pattern = r'[^.]*' + re.escape(keyword) + r'[^.]*'
    sentences = (match.strip() for match in re.findall(pattern, text))
    # Drop blank results, which can only occur for an empty keyword.
    return [sentence for sentence in sentences if sentence]


example = "People are fighting with covid these days. Economy has fallen down. How will we survive covid"
pick_only_key_sentence(example, 'covid')
输出:['People are fighting with covid these days', 'How will we survive covid']
def find_capital(text):
    """Return the words of *text* that start with an ASCII capital letter.

    Args:
        text: Text to scan.

    Returns:
        A list of the capitalised words, in order of appearance.
    """
    # \b anchors the capital at the start of a word, so "iPhone" does not
    # yield "Phone".
    return re.findall(r'\b[A-Z]\w*', text)


example = "Ajit Doval is the best National Security Advisor so far."
find_capital(example)
输出:['Ajit', 'Doval', 'National', 'Security', 'Advisor']
def remove_tag(string):
    """Return *string* with every ``<...>`` tag removed.

    Args:
        string: Text that may contain tags.

    Returns:
        The text without the tags; everything else is left untouched.
    """
    # [^>]* also crosses line breaks, so a tag split over several lines is
    # removed too (".*?" stops at a newline).
    return re.sub(r'<[^>]*>', '', string)


# NOTE(review): this sample contains no tags (they may have been lost when
# the article was extracted), so it is returned unchanged.
example = "Markdown sentences use for breaks and for italics"
remove_tag(example)
输出:'Markdown sentences use for breaks and for italics'
def ip_add(string):
    """Return the dotted-quad IPv4 addresses found in *string*.

    Each of the four parts must be a number from 0 to 255 (leading zeros
    allowed).

    Args:
        string: Text to scan.

    Returns:
        A list of the matching address strings.
    """
    octet = r'(?:25[0-5]|2[0-4]\d|[01]?\d?\d)'
    # The look-arounds keep a match from being cut out of a longer
    # digit/dot sequence while still allowing a sentence-ending full stop.
    pattern = r'(?<![\d.])(?:' + octet + r'\.){3}' + octet + r'(?!\.?\d)'
    return re.findall(pattern, string)


example = "My public IP address is 165.19.120.1"
ip_add(example)
输出:['165.19.120.1']
def mac_add(string):
    """Return the MAC addresses found in *string*.

    Accepted forms are six colon-separated pairs of hexadecimal digits
    (``00:21:27:b1:aa:d2``) or twelve hexadecimal digits in a row.

    Args:
        string: Text to scan.

    Returns:
        A list of the matching address strings.
    """
    # Requiring complete pairs rejects strings whose colons sit at
    # arbitrary positions between the hex digits.
    return re.findall(
        r'\b(?:[0-9a-fA-F]{2}(?::[0-9a-fA-F]{2}){5}|[0-9a-fA-F]{12})\b',
        string,
    )


# NOTE(review): "xx" is not hexadecimal, so this sample yields an empty list.
example = "MAC ADDRESSES of this TOSHIBA laptop is 00:21:27:b1:aa:xx."
mac_add(example)
输出:[](示例地址末尾的“xx”不是十六进制字符,因此不会被匹配;换成合法的十六进制地址,例如 00:21:27:b1:aa:d2,才会返回该地址)
def validPan(string):
    """Report whether *string* has the PAN layout and print the verdict.

    The accepted layout is five uppercase letters, four digits and one
    uppercase letter, with nothing before or after.

    Args:
        string: Candidate PAN number.

    Returns:
        True when the layout matches, False otherwise.
    """
    # fullmatch rejects a trailing newline, which "$" would tolerate.
    is_valid = re.fullmatch(r'[A-Z]{5}\d{4}[A-Z]{1}', string) is not None
    if is_valid:
        print("{} is valid PAN number".format(string))
    else:
        print("{} is not a valid PAN number".format(string))
    return is_valid


validPan("ABCED3193P")
validPan("lEcGD012eg")
def find_percent(string):
    """Return the whole-number percentages from 0 to 100 found in *string*.

    Args:
        string: Text to scan.

    Returns:
        A list of the numbers as strings, without the ``%`` sign.
    """
    # The look-behind stops a match from starting inside a longer number:
    # "150%" must not yield "50", nor "12.5%" yield "5".
    return re.findall(r'(?<![\d.])(100|\d{1,2})%', string)


example = "COVID recovery rate is now 76%. But death rate is 4%"
find_percent(example)
def find_files(string):
    """Return the file names with a known extension found in *string*.

    A file name is a run of ASCII letters, digits or underscores followed
    by a dot and one of: jpg, png, gif, jpeg, pdf, ipynb, py.

    Args:
        string: Text to scan.

    Returns:
        A list of ``name.ext`` strings, in order of appearance.
    """
    # The trailing \b keeps a longer extension such as ".pyc" from being
    # reported as ".py".
    return re.findall(
        r'[a-zA-Z0-9_]+\.(?:jpg|png|gif|jpeg|pdf|ipynb|py)\b',
        string,
    )


example = "This image file name is cheatsheet.png. Titanic.py file is most common among beginners."
find_files(example)