Finding exact words in text, excluding quoted words
In the JavaScript code below I need to find exact words in a text, excluding the words that are between quotes. This is my attempt; what's wrong with the regex? It should find all the words except word22 and "word3". If I use only \b in the regex, it selects exact words but it doesn't exclude the words between quotes.
// Sample text: the stated goal is to match word1, word2 and word4, but not
// word22 or the quoted "word3".
var text = 'word1, word2, word22, "word3" and word4';
var words = [ 'word1', 'word2', 'word3' , 'word4' ];
// Build one case-insensitive regex per word and report the first match position.
words.forEach(function(word){
// NOTE(review): as written, '\b' inside a string literal is a backspace
// character rather than a regex word boundary, and ^ outside a character
// class anchors to the start of the input instead of negating the quote.
var re = new RegExp('\b^"' + word + '^"\b', 'i');
// String.prototype.search returns the index of the first match, or -1.
var pos = text.search(re);
if (pos > -1)
alert(word + " found in position " + pos);
});
javascript regex
add a comment |
In the JavaScript code below I need to find exact words in a text, excluding the words that are between quotes. This is my attempt; what's wrong with the regex? It should find all the words except word22 and "word3". If I use only \b in the regex, it selects exact words but it doesn't exclude the words between quotes.
// Sample text: the stated goal is to match word1, word2 and word4, but not
// word22 or the quoted "word3".
var text = 'word1, word2, word22, "word3" and word4';
var words = [ 'word1', 'word2', 'word3' , 'word4' ];
// Build one case-insensitive regex per word and report the first match position.
words.forEach(function(word){
// NOTE(review): as written, '\b' inside a string literal is a backspace
// character rather than a regex word boundary, and ^ outside a character
// class anchors to the start of the input instead of negating the quote.
var re = new RegExp('\b^"' + word + '^"\b', 'i');
// String.prototype.search returns the index of the first match, or -1.
var pos = text.search(re);
if (pos > -1)
alert(word + " found in position " + pos);
});
javascript regex
add a comment |
In the JavaScript code below I need to find exact words in a text, excluding the words that are between quotes. This is my attempt; what's wrong with the regex? It should find all the words except word22 and "word3". If I use only \b in the regex, it selects exact words but it doesn't exclude the words between quotes.
// Sample text: the stated goal is to match word1, word2 and word4, but not
// word22 or the quoted "word3".
var text = 'word1, word2, word22, "word3" and word4';
var words = [ 'word1', 'word2', 'word3' , 'word4' ];
// Build one case-insensitive regex per word and report the first match position.
words.forEach(function(word){
// NOTE(review): as written, '\b' inside a string literal is a backspace
// character rather than a regex word boundary, and ^ outside a character
// class anchors to the start of the input instead of negating the quote.
var re = new RegExp('\b^"' + word + '^"\b', 'i');
// String.prototype.search returns the index of the first match, or -1.
var pos = text.search(re);
if (pos > -1)
alert(word + " found in position " + pos);
});
javascript regex
In the JavaScript code below I need to find exact words in a text, excluding the words that are between quotes. This is my attempt; what's wrong with the regex? It should find all the words except word22 and "word3". If I use only \b in the regex, it selects exact words but it doesn't exclude the words between quotes.
// Sample text: the stated goal is to match word1, word2 and word4, but not
// word22 or the quoted "word3".
var text = 'word1, word2, word22, "word3" and word4';
var words = [ 'word1', 'word2', 'word3' , 'word4' ];
// Build one case-insensitive regex per word and report the first match position.
words.forEach(function(word){
// NOTE(review): as written, '\b' inside a string literal is a backspace
// character rather than a regex word boundary, and ^ outside a character
// class anchors to the start of the input instead of negating the quote.
var re = new RegExp('\b^"' + word + '^"\b', 'i');
// String.prototype.search returns the index of the first match, or -1.
var pos = text.search(re);
if (pos > -1)
alert(word + " found in position " + pos);
});
javascript regex
javascript regex
edited Nov 28 '18 at 3:30
ps0604
asked Nov 28 '18 at 3:22
ps0604ps0604
572947136
572947136
add a comment |
add a comment |
2 Answers
2
active
oldest
votes
First, we'll use a function to escape the characters of the word, just in case there's some that have special meaning for regexp.
// from https://stackoverflow.com/a/30851002/240443
/**
 * Escapes every character of `literal_string` that has special meaning in a
 * regular expression, so the result can be embedded in a `RegExp` source and
 * match the original text literally.
 *
 * @param {string} literal_string - Text to be matched verbatim.
 * @returns {string} The text with each special character (and each whitespace
 *   character or comma) prefixed by a backslash.
 */
function regExpEscape(literal_string) {
  // Inside the character class, `]`, `/` and `\` must themselves be escaped,
  // and `\s` needs its backslash to mean "whitespace" rather than the letter s.
  // The replacement '\\$&' is a literal backslash followed by the matched text.
  return literal_string.replace(/[-[\]{}()*+!<=:?.\/\\^$|#\s,]/g, '\\$&');
}
Then, we construct a regular expression as an alternation between individual word regexps. For each word, we assert that it starts with a word boundary, ends with a word boundary, and has an even number of quote characters between its end, and the end of string. (Note that from the end of word3, there is only one quote till the end of string, which is odd.)
let text = 'word1, word2, word22, "word3" and word4';
let words = [ 'word1', 'word2', 'word3' , 'word4' ];

// One alternative per word: \bWORD\b followed by a lookahead asserting that an
// even number of double quotes remains before the end of the string, i.e. the
// word is not inside a quoted section.
// The backslash is doubled ('\\b') because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(words.map(word =>
  '\\b' + regExpEscape(word) + '\\b(?=(?:[^"]*"[^"]*")*[^"]*$)').join('|'), 'g');

text.match(regexp);
// => word1, word2, word4

// Declare the match variable so the loop does not create an implicit global
// (which throws in strict mode and in ES modules).
let m;
while ((m = regexp.exec(text))) {
  console.log(m[0], m.index);
}
// word1 0
// word2 7
// word4 34
EDIT: Actually, we can speed the regexp up a bit if we factor out the surrounding conditions:
// Factored form: the word boundaries and the "even number of quotes ahead"
// lookahead are written once around a single non-capturing alternation of the
// escaped words. '\\b' is doubled because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(
  '\\b(?:' +
  words.map(regExpEscape).join('|') +
  ')\\b(?=(?:[^"]*"[^"]*")*[^"]*$)', 'g');
This is probably the better solution because it balances the quotes around something. Mine wouldn't match something like "word2 or word2" where it starts or ends with a quotation mark, but isn't surrounded by one.
– Matti Price
Nov 28 '18 at 4:14
add a comment |
Your excluding of the quote character is wrong, that's actually matching the beginning of the string followed by a quote. Trying this instead
// First attempt: require a non-quote character on each side of the word.
// '\\b' is doubled because '\b' in a string literal is a backspace character,
// not a regex word boundary. Note that each [^"] consumes one character.
var re = new RegExp('\\b[^"]' + word + '[^"]\\b', 'i');
Also, this site is amazing to help you debug regex : https://regexpal.com
Edit: Because \b will match on quotation marks, this needs to be tweaked further. Unfortunately JavaScript doesn't support lookbehinds, so we have to get a little tricky.
// The word must be preceded by the start of input or by a character that is
// neither a quote nor a word character, and followed by the end of input or
// such a character. '\\w' is doubled because '\w' in a string literal is just
// the letter "w", which would make the class exclude "w" instead of word chars.
var re = new RegExp('(?:^|[^"\\w])' + word + '(?:$|[^"\\w])','i')
So what this is doing is saying
(?: Don't capture this group
^|[^"\w]) either match the start of the line, or any non-word character (not alphanumeric or underscore) that isn't a quote
word match your word here
(?: Don't capture this group either
$|[^"\w]) either match the end of the line, or any non-word character that isn't a quote again
it's not working, see here regexpal.com/?fam=106212
– ps0604
Nov 28 '18 at 3:37
If seeking word2, you'd only find it if the string contained bword2e or similar, as your "not a quote" assertions are not null-width, and will have to consume a character each.
– Amadan
Nov 28 '18 at 3:37
since javascript doesn't support lookbehinds, this is a little more annoying, but see if that update works for you @ps0604
– Matti Price
Nov 28 '18 at 4:03
yes, your update works
– ps0604
Nov 28 '18 at 4:09
add a comment |
Your Answer
// Page bootstrap: nested StackExchange loader callbacks. The innermost
// callback calls StackExchange.snippets.init().
// NOTE(review): ifUsing/using are defined outside this page; presumably they
// run the callback once the named module is available. Confirm before relying on it.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
// Page bootstrap for the answer editor: sets up the tag renderer, then creates
// the editor (after the snippets module when snippets are enabled).
StackExchange.ready(function() {
  var channelOptions = {
    tags: "".split(" "),
    id: "1"
  };
  initTagRenderer("".split(" "), "".split(" "), channelOptions);

  StackExchange.using("externalEditor", function() {
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled) {
      StackExchange.using("snippets", function() {
        createEditor();
      });
    }
    else {
      createEditor();
    }
  });

  // Passes the answer-editor options to StackExchange.prepareEditor.
  function createEditor() {
    StackExchange.prepareEditor({
      heartbeatType: 'answer',
      autoActivateHeartbeat: false,
      convertImagesToLinks: true,
      noModals: true,
      showLowRepImageUploadWarning: true,
      reputationToPostImages: 10,
      bindNavPrevention: true,
      postfix: "",
      imageUploader: {
        // The inner quotes and the \u003c / \u003e escapes (for "<" and ">")
        // need their backslashes; without them the string literals end early
        // and the script does not parse.
        brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
        contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
        allowUrls: true
      },
      onDemand: true,
      discardSelector: ".discard-answer"
      ,immediatelyShowMarkdownHelp:true
    });
  }
});
Sign up or log in
// Page bootstrap: registers a callback with StackExchange.ready that passes
// the '#login-link' selector to StackExchange.helpers.onClickDraftSave
// (a helper defined outside this page).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// Page bootstrap: registers a callback with StackExchange.ready that calls
// StackExchange.openid.initPostLogin. The second argument is the
// percent-encoded URL of this question's #new-answer anchor.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53511629%2ffinding-exact-words-in-text-excluding-quoted-words%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
First, we'll use a function to escape the characters of the word, just in case there's some that have special meaning for regexp.
// from https://stackoverflow.com/a/30851002/240443
/**
 * Escapes every character of `literal_string` that has special meaning in a
 * regular expression, so the result can be embedded in a `RegExp` source and
 * match the original text literally.
 *
 * @param {string} literal_string - Text to be matched verbatim.
 * @returns {string} The text with each special character (and each whitespace
 *   character or comma) prefixed by a backslash.
 */
function regExpEscape(literal_string) {
  // Inside the character class, `]`, `/` and `\` must themselves be escaped,
  // and `\s` needs its backslash to mean "whitespace" rather than the letter s.
  // The replacement '\\$&' is a literal backslash followed by the matched text.
  return literal_string.replace(/[-[\]{}()*+!<=:?.\/\\^$|#\s,]/g, '\\$&');
}
Then, we construct a regular expression as an alternation between individual word regexps. For each word, we assert that it starts with a word boundary, ends with a word boundary, and has an even number of quote characters between its end, and the end of string. (Note that from the end of word3, there is only one quote till the end of string, which is odd.)
let text = 'word1, word2, word22, "word3" and word4';
let words = [ 'word1', 'word2', 'word3' , 'word4' ];

// One alternative per word: \bWORD\b followed by a lookahead asserting that an
// even number of double quotes remains before the end of the string, i.e. the
// word is not inside a quoted section.
// The backslash is doubled ('\\b') because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(words.map(word =>
  '\\b' + regExpEscape(word) + '\\b(?=(?:[^"]*"[^"]*")*[^"]*$)').join('|'), 'g');

text.match(regexp);
// => word1, word2, word4

// Declare the match variable so the loop does not create an implicit global
// (which throws in strict mode and in ES modules).
let m;
while ((m = regexp.exec(text))) {
  console.log(m[0], m.index);
}
// word1 0
// word2 7
// word4 34
EDIT: Actually, we can speed the regexp up a bit if we factor out the surrounding conditions:
// Factored form: the word boundaries and the "even number of quotes ahead"
// lookahead are written once around a single non-capturing alternation of the
// escaped words. '\\b' is doubled because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(
  '\\b(?:' +
  words.map(regExpEscape).join('|') +
  ')\\b(?=(?:[^"]*"[^"]*")*[^"]*$)', 'g');
This is probably the better solution because it balances the quotes around something. Mine wouldn't match something like "word2 or word2" where it starts or ends with a quotation mark, but isn't surrounded by one.
– Matti Price
Nov 28 '18 at 4:14
add a comment |
First, we'll use a function to escape the characters of the word, just in case there's some that have special meaning for regexp.
// from https://stackoverflow.com/a/30851002/240443
/**
 * Escapes every character of `literal_string` that has special meaning in a
 * regular expression, so the result can be embedded in a `RegExp` source and
 * match the original text literally.
 *
 * @param {string} literal_string - Text to be matched verbatim.
 * @returns {string} The text with each special character (and each whitespace
 *   character or comma) prefixed by a backslash.
 */
function regExpEscape(literal_string) {
  // Inside the character class, `]`, `/` and `\` must themselves be escaped,
  // and `\s` needs its backslash to mean "whitespace" rather than the letter s.
  // The replacement '\\$&' is a literal backslash followed by the matched text.
  return literal_string.replace(/[-[\]{}()*+!<=:?.\/\\^$|#\s,]/g, '\\$&');
}
Then, we construct a regular expression as an alternation between individual word regexps. For each word, we assert that it starts with a word boundary, ends with a word boundary, and has an even number of quote characters between its end, and the end of string. (Note that from the end of word3, there is only one quote till the end of string, which is odd.)
let text = 'word1, word2, word22, "word3" and word4';
let words = [ 'word1', 'word2', 'word3' , 'word4' ];

// One alternative per word: \bWORD\b followed by a lookahead asserting that an
// even number of double quotes remains before the end of the string, i.e. the
// word is not inside a quoted section.
// The backslash is doubled ('\\b') because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(words.map(word =>
  '\\b' + regExpEscape(word) + '\\b(?=(?:[^"]*"[^"]*")*[^"]*$)').join('|'), 'g');

text.match(regexp);
// => word1, word2, word4

// Declare the match variable so the loop does not create an implicit global
// (which throws in strict mode and in ES modules).
let m;
while ((m = regexp.exec(text))) {
  console.log(m[0], m.index);
}
// word1 0
// word2 7
// word4 34
EDIT: Actually, we can speed the regexp up a bit if we factor out the surrounding conditions:
// Factored form: the word boundaries and the "even number of quotes ahead"
// lookahead are written once around a single non-capturing alternation of the
// escaped words. '\\b' is doubled because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(
  '\\b(?:' +
  words.map(regExpEscape).join('|') +
  ')\\b(?=(?:[^"]*"[^"]*")*[^"]*$)', 'g');
This is probably the better solution because it balances the quotes around something. Mine wouldn't match something like "word2 or word2" where it starts or ends with a quotation mark, but isn't surrounded by one.
– Matti Price
Nov 28 '18 at 4:14
add a comment |
First, we'll use a function to escape the characters of the word, just in case there's some that have special meaning for regexp.
// from https://stackoverflow.com/a/30851002/240443
/**
 * Escapes every character of `literal_string` that has special meaning in a
 * regular expression, so the result can be embedded in a `RegExp` source and
 * match the original text literally.
 *
 * @param {string} literal_string - Text to be matched verbatim.
 * @returns {string} The text with each special character (and each whitespace
 *   character or comma) prefixed by a backslash.
 */
function regExpEscape(literal_string) {
  // Inside the character class, `]`, `/` and `\` must themselves be escaped,
  // and `\s` needs its backslash to mean "whitespace" rather than the letter s.
  // The replacement '\\$&' is a literal backslash followed by the matched text.
  return literal_string.replace(/[-[\]{}()*+!<=:?.\/\\^$|#\s,]/g, '\\$&');
}
Then, we construct a regular expression as an alternation between individual word regexps. For each word, we assert that it starts with a word boundary, ends with a word boundary, and has an even number of quote characters between its end, and the end of string. (Note that from the end of word3, there is only one quote till the end of string, which is odd.)
let text = 'word1, word2, word22, "word3" and word4';
let words = [ 'word1', 'word2', 'word3' , 'word4' ];

// One alternative per word: \bWORD\b followed by a lookahead asserting that an
// even number of double quotes remains before the end of the string, i.e. the
// word is not inside a quoted section.
// The backslash is doubled ('\\b') because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(words.map(word =>
  '\\b' + regExpEscape(word) + '\\b(?=(?:[^"]*"[^"]*")*[^"]*$)').join('|'), 'g');

text.match(regexp);
// => word1, word2, word4

// Declare the match variable so the loop does not create an implicit global
// (which throws in strict mode and in ES modules).
let m;
while ((m = regexp.exec(text))) {
  console.log(m[0], m.index);
}
// word1 0
// word2 7
// word4 34
EDIT: Actually, we can speed the regexp up a bit if we factor out the surrounding conditions:
// Factored form: the word boundaries and the "even number of quotes ahead"
// lookahead are written once around a single non-capturing alternation of the
// escaped words. '\\b' is doubled because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(
  '\\b(?:' +
  words.map(regExpEscape).join('|') +
  ')\\b(?=(?:[^"]*"[^"]*")*[^"]*$)', 'g');
First, we'll use a function to escape the characters of the word, just in case there's some that have special meaning for regexp.
// from https://stackoverflow.com/a/30851002/240443
/**
 * Escapes every character of `literal_string` that has special meaning in a
 * regular expression, so the result can be embedded in a `RegExp` source and
 * match the original text literally.
 *
 * @param {string} literal_string - Text to be matched verbatim.
 * @returns {string} The text with each special character (and each whitespace
 *   character or comma) prefixed by a backslash.
 */
function regExpEscape(literal_string) {
  // Inside the character class, `]`, `/` and `\` must themselves be escaped,
  // and `\s` needs its backslash to mean "whitespace" rather than the letter s.
  // The replacement '\\$&' is a literal backslash followed by the matched text.
  return literal_string.replace(/[-[\]{}()*+!<=:?.\/\\^$|#\s,]/g, '\\$&');
}
Then, we construct a regular expression as an alternation between individual word regexps. For each word, we assert that it starts with a word boundary, ends with a word boundary, and has an even number of quote characters between its end, and the end of string. (Note that from the end of word3, there is only one quote till the end of string, which is odd.)
let text = 'word1, word2, word22, "word3" and word4';
let words = [ 'word1', 'word2', 'word3' , 'word4' ];

// One alternative per word: \bWORD\b followed by a lookahead asserting that an
// even number of double quotes remains before the end of the string, i.e. the
// word is not inside a quoted section.
// The backslash is doubled ('\\b') because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(words.map(word =>
  '\\b' + regExpEscape(word) + '\\b(?=(?:[^"]*"[^"]*")*[^"]*$)').join('|'), 'g');

text.match(regexp);
// => word1, word2, word4

// Declare the match variable so the loop does not create an implicit global
// (which throws in strict mode and in ES modules).
let m;
while ((m = regexp.exec(text))) {
  console.log(m[0], m.index);
}
// word1 0
// word2 7
// word4 34
EDIT: Actually, we can speed the regexp up a bit if we factor out the surrounding conditions:
// Factored form: the word boundaries and the "even number of quotes ahead"
// lookahead are written once around a single non-capturing alternation of the
// escaped words. '\\b' is doubled because '\b' in a string literal is a
// backspace character, not a regex word boundary.
let regexp = new RegExp(
  '\\b(?:' +
  words.map(regExpEscape).join('|') +
  ')\\b(?=(?:[^"]*"[^"]*")*[^"]*$)', 'g');
edited Nov 28 '18 at 4:20
answered Nov 28 '18 at 3:55
AmadanAmadan
132k13145197
132k13145197
This is probably the better solution because it balances the quotes around something. Mine wouldn't match something like "word2 or word2" where it starts or ends with a quotation mark, but isn't surrounded by one.
– Matti Price
Nov 28 '18 at 4:14
add a comment |
This is probably the better solution because it balances the quotes around something. Mine wouldn't match something like "word2 or word2" where it starts or ends with a quotation mark, but isn't surrounded by one.
– Matti Price
Nov 28 '18 at 4:14
This is probably the better solution because it balances the quotes around something. Mine wouldn't match something like
"word2 or word2" where it starts or ends with a quotation mark, but isn't surrounded by one.– Matti Price
Nov 28 '18 at 4:14
This is probably the better solution because it balances the quotes around something. Mine wouldn't match something like
"word2 or word2" where it starts or ends with a quotation mark, but isn't surrounded by one.– Matti Price
Nov 28 '18 at 4:14
add a comment |
Your excluding of the quote character is wrong, that's actually matching the beginning of the string followed by a quote. Trying this instead
// First attempt: require a non-quote character on each side of the word.
// '\\b' is doubled because '\b' in a string literal is a backspace character,
// not a regex word boundary. Note that each [^"] consumes one character.
var re = new RegExp('\\b[^"]' + word + '[^"]\\b', 'i');
Also, this site is amazing to help you debug regex : https://regexpal.com
Edit: Because \b will match on quotation marks, this needs to be tweaked further. Unfortunately JavaScript doesn't support lookbehinds, so we have to get a little tricky.
// The word must be preceded by the start of input or by a character that is
// neither a quote nor a word character, and followed by the end of input or
// such a character. '\\w' is doubled because '\w' in a string literal is just
// the letter "w", which would make the class exclude "w" instead of word chars.
var re = new RegExp('(?:^|[^"\\w])' + word + '(?:$|[^"\\w])','i')
So what this is doing is saying
(?: Don't capture this group
^|[^"\w]) either match the start of the line, or any non-word character (not alphanumeric or underscore) that isn't a quote
word match your word here
(?: Don't capture this group either
$|[^"\w]) either match the end of the line, or any non-word character that isn't a quote again
it's not working, see here regexpal.com/?fam=106212
– ps0604
Nov 28 '18 at 3:37
If seeking word2, you'd only find it if the string contained bword2e or similar, as your "not a quote" assertions are not null-width, and will have to consume a character each.
– Amadan
Nov 28 '18 at 3:37
since javascript doesn't support lookbehinds, this is a little more annoying, but see if that update works for you @ps0604
– Matti Price
Nov 28 '18 at 4:03
yes, your update works
– ps0604
Nov 28 '18 at 4:09
add a comment |
Your excluding of the quote character is wrong, that's actually matching the beginning of the string followed by a quote. Trying this instead
// First attempt: require a non-quote character on each side of the word.
// '\\b' is doubled because '\b' in a string literal is a backspace character,
// not a regex word boundary. Note that each [^"] consumes one character.
var re = new RegExp('\\b[^"]' + word + '[^"]\\b', 'i');
Also, this site is amazing to help you debug regex : https://regexpal.com
Edit: Because \b will match on quotation marks, this needs to be tweaked further. Unfortunately JavaScript doesn't support lookbehinds, so we have to get a little tricky.
// The word must be preceded by the start of input or by a character that is
// neither a quote nor a word character, and followed by the end of input or
// such a character. '\\w' is doubled because '\w' in a string literal is just
// the letter "w", which would make the class exclude "w" instead of word chars.
var re = new RegExp('(?:^|[^"\\w])' + word + '(?:$|[^"\\w])','i')
So what this is doing is saying
(?: Don't capture this group
^|[^"\w]) either match the start of the line, or any non-word character (not alphanumeric or underscore) that isn't a quote
word match your word here
(?: Don't capture this group either
$|[^"\w]) either match the end of the line, or any non-word character that isn't a quote again
it's not working, see here regexpal.com/?fam=106212
– ps0604
Nov 28 '18 at 3:37
If seeking word2, you'd only find it if the string contained bword2e or similar, as your "not a quote" assertions are not null-width, and will have to consume a character each.
– Amadan
Nov 28 '18 at 3:37
since javascript doesn't support lookbehinds, this is a little more annoying, but see if that update works for you @ps0604
– Matti Price
Nov 28 '18 at 4:03
yes, your update works
– ps0604
Nov 28 '18 at 4:09
add a comment |
Your excluding of the quote character is wrong, that's actually matching the beginning of the string followed by a quote. Trying this instead
// First attempt: require a non-quote character on each side of the word.
// '\\b' is doubled because '\b' in a string literal is a backspace character,
// not a regex word boundary. Note that each [^"] consumes one character.
var re = new RegExp('\\b[^"]' + word + '[^"]\\b', 'i');
Also, this site is amazing to help you debug regex : https://regexpal.com
Edit: Because \b will match on quotation marks, this needs to be tweaked further. Unfortunately JavaScript doesn't support lookbehinds, so we have to get a little tricky.
// The word must be preceded by the start of input or by a character that is
// neither a quote nor a word character, and followed by the end of input or
// such a character. '\\w' is doubled because '\w' in a string literal is just
// the letter "w", which would make the class exclude "w" instead of word chars.
var re = new RegExp('(?:^|[^"\\w])' + word + '(?:$|[^"\\w])','i')
So what this is doing is saying
(?: Don't capture this group
^|[^"\w]) either match the start of the line, or any non-word character (not alphanumeric or underscore) that isn't a quote
word match your word here
(?: Don't capture this group either
$|[^"\w]) either match the end of the line, or any non-word character that isn't a quote again
Your excluding of the quote character is wrong, that's actually matching the beginning of the string followed by a quote. Trying this instead
// First attempt: require a non-quote character on each side of the word.
// '\\b' is doubled because '\b' in a string literal is a backspace character,
// not a regex word boundary. Note that each [^"] consumes one character.
var re = new RegExp('\\b[^"]' + word + '[^"]\\b', 'i');
Also, this site is amazing to help you debug regex : https://regexpal.com
Edit: Because \b will match on quotation marks, this needs to be tweaked further. Unfortunately JavaScript doesn't support lookbehinds, so we have to get a little tricky.
// The word must be preceded by the start of input or by a character that is
// neither a quote nor a word character, and followed by the end of input or
// such a character. '\\w' is doubled because '\w' in a string literal is just
// the letter "w", which would make the class exclude "w" instead of word chars.
var re = new RegExp('(?:^|[^"\\w])' + word + '(?:$|[^"\\w])','i')
So what this is doing is saying
(?: Don't capture this group
^|[^"\w]) either match the start of the line, or any non-word character (not alphanumeric or underscore) that isn't a quote
word match your word here
(?: Don't capture this group either
$|[^"\w]) either match the end of the line, or any non-word character that isn't a quote again
edited Nov 28 '18 at 4:02
answered Nov 28 '18 at 3:31
Matti PriceMatti Price
2,076922
2,076922
it's not working, see here regexpal.com/?fam=106212
– ps0604
Nov 28 '18 at 3:37
If seeking word2, you'd only find it if the string contained bword2e or similar, as your "not a quote" assertions are not null-width, and will have to consume a character each.
– Amadan
Nov 28 '18 at 3:37
since javascript doesn't support lookbehinds, this is a little more annoying, but see if that update works for you @ps0604
– Matti Price
Nov 28 '18 at 4:03
yes, your update works
– ps0604
Nov 28 '18 at 4:09
add a comment |
it's not working, see here regexpal.com/?fam=106212
– ps0604
Nov 28 '18 at 3:37
If seeking word2, you'd only find it if the string contained bword2e or similar, as your "not a quote" assertions are not null-width, and will have to consume a character each.
– Amadan
Nov 28 '18 at 3:37
since javascript doesn't support lookbehinds, this is a little more annoying, but see if that update works for you @ps0604
– Matti Price
Nov 28 '18 at 4:03
yes, your update works
– ps0604
Nov 28 '18 at 4:09
it's not working, see here regexpal.com/?fam=106212
– ps0604
Nov 28 '18 at 3:37
it's not working, see here regexpal.com/?fam=106212
– ps0604
Nov 28 '18 at 3:37
If seeking
word2, you'd only find it if the string contained bword2e or similar, as your "not a quote" assertions are not null-width, and will have to consume a character each.– Amadan
Nov 28 '18 at 3:37
If seeking
word2, you'd only find it if the string contained bword2e or similar, as your "not a quote" assertions are not null-width, and will have to consume a character each.– Amadan
Nov 28 '18 at 3:37
since javascript doesn't support lookbehinds, this is a little more annoying, but see if that update works for you @ps0604
– Matti Price
Nov 28 '18 at 4:03
since javascript doesn't support lookbehinds, this is a little more annoying, but see if that update works for you @ps0604
– Matti Price
Nov 28 '18 at 4:03
yes, your update works
– ps0604
Nov 28 '18 at 4:09
yes, your update works
– ps0604
Nov 28 '18 at 4:09
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
// Page bootstrap: registers a callback with StackExchange.ready that passes
// the '#login-link' selector to StackExchange.helpers.onClickDraftSave
// (a helper defined outside this page).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// Page bootstrap: registers a callback with StackExchange.ready that calls
// StackExchange.openid.initPostLogin. The second argument is the
// percent-encoded URL of this question's #new-answer anchor.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53511629%2ffinding-exact-words-in-text-excluding-quoted-words%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
// Page bootstrap: registers a callback with StackExchange.ready that passes
// the '#login-link' selector to StackExchange.helpers.onClickDraftSave
// (a helper defined outside this page).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// Page bootstrap: registers a callback with StackExchange.ready that passes
// the '#login-link' selector to StackExchange.helpers.onClickDraftSave
// (a helper defined outside this page).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// Page bootstrap: registers a callback with StackExchange.ready that passes
// the '#login-link' selector to StackExchange.helpers.onClickDraftSave
// (a helper defined outside this page).
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown