Skip to content
  • Categories
  • Recent
  • Tags
  • Popular
  • World
  • Users
  • Groups
Skins
  • Light
  • Cerulean
  • Cosmo
  • Flatly
  • Journal
  • Litera
  • Lumen
  • Lux
  • Materia
  • Minty
  • Morph
  • Pulse
  • Sandstone
  • Simplex
  • Sketchy
  • Spacelab
  • United
  • Yeti
  • Zephyr
  • Dark
  • Cyborg
  • Darkly
  • Quartz
  • Slate
  • Solar
  • Superhero
  • Vapor

  • Default (No Skin)
  • No Skin
Collapse
Code Project
  1. Home
  2. General Programming
  3. C#
  4. splitting sentence on the basis of conjunctions

splitting sentence on the basis of conjunctions

Scheduled Pinned Locked Moved C#
helpquestion
8 Posts 5 Posters 0 Views 1 Watching
  • Oldest to Newest
  • Newest to Oldest
  • Most Votes
Reply
  • Reply as topic
Log in to reply
This topic has been deleted. Only users with topic management privileges can see it.
  • K Offline
    K Offline
    KhanKtk
    wrote on last edited by
    #1

    Here is what I did so far. The problem is that if a conjunction appears twice in the sentence, the code doesn't work for the second appearance of the conjunction. Could any expert please help? private void SplitSentence_Click(object sender, EventArgs e) { richTextBox2.Text = ""; richTextBox3.Text = ""; string[] keywords = { " or ", " and ", " hence", "so that", "however", " because" }; string[] sentences = SentenceTokenizer(richTextBox1.Text); string remSentence; foreach (string sentence in sentences) { remSentence = sentence; richTextBox3.Text = remSentence; for (int i =0; i < keywords.Length; i++) { if ((remSentence.Contains(keywords[i])))// || (remSentence.IndexOf(keywords[i]) > 0)) { richTextBox2.Text += remSentence.Substring(0, remSentence.IndexOf(keywords[i])) + '\n' + keywords[i] + '\n'; remSentence = remSentence.Substring(remSentence.IndexOf(keywords[i]) + keywords[i].Length); } } richTextBox2.Text += remSentence; } } public static string[] SentenceTokenizer(string text) { char[] sentdelimiters = new char[] { '.', '?', '۔', '؟', '\r', ':', '-' }; // '{ ',' }', '( ', ' )', ' [', ']', '>', '<','-', '_', '= ', '+','|', '\\', ':', ';', ' ', '\'', ',', '.', '/', '?', '~', '!','@', '#', '$', '%', '^', '&', '*', ' ', '\r', '\n', '\t'}; // text.Remove('\n'); return text.Split(sentdelimiters, StringSplitOptions.RemoveEmptyEntries); }

    Kornfeld Eliyahu PeterK B L 3 Replies Last reply
    0
    • K KhanKtk

      Here is what i did so far. The problem is if a conjunction appears twice in the sentence the code doesnt work for the 2nd appearance of the conjunction. plz if any expert can help ? private void SplitSentence_Click(object sender, EventArgs e) { richTextBox2.Text = ""; richTextBox3.Text = ""; string[] keywords = { " or ", " and ", " hence", "so that", "however", " because" }; string[] sentences = SentenceTokenizer(richTextBox1.Text); string remSentence; foreach (string sentence in sentences) { remSentence = sentence; richTextBox3.Text = remSentence; for (int i =0; i < keywords.Length; i++) { if ((remSentence.Contains(keywords[i])))// || (remSentence.IndexOf(keywords[i]) > 0)) { richTextBox2.Text += remSentence.Substring(0, remSentence.IndexOf(keywords[i])) + '\n' + keywords[i] + '\n'; remSentence = remSentence.Substring(remSentence.IndexOf(keywords[i]) + keywords[i].Length); } } richTextBox2.Text += remSentence; } } public static string[] SentenceTokenizer(string text) { char[] sentdelimiters = new char[] { '.', '?', '۔', '؟', '\r', ':', '-' }; // '{ ',' }', '( ', ' )', ' [', ']', '>', '<','-', '_', '= ', '+','|', '\\', ':', ';', ' ', '\'', ',', '.', '/', '?', '~', '!','@', '#', '$', '%', '^', '&', '*', ' ', '\r', '\n', '\t'}; // text.Remove('\n'); return text.Split(sentdelimiters, StringSplitOptions.RemoveEmptyEntries); }

      Kornfeld Eliyahu PeterK Offline
      Kornfeld Eliyahu PeterK Offline
      Kornfeld Eliyahu Peter
      wrote on last edited by
      #2

      I think you have to learn some regex, it can help you out... http://www.regular-expressions.info/[^] http://regex.learncodethehardway.org/[^]

      I'm not questioning your powers of observation; I'm merely remarking upon the paradox of asking a masked man who he is. (V)

      "It never ceases to amaze me that a spacecraft launched in 1977 can be fixed remotely from Earth." ― Brian Cox

      K 1 Reply Last reply
      0
      • K KhanKtk

        Here is what i did so far. The problem is if a conjunction appears twice in the sentence the code doesnt work for the 2nd appearance of the conjunction. plz if any expert can help ? private void SplitSentence_Click(object sender, EventArgs e) { richTextBox2.Text = ""; richTextBox3.Text = ""; string[] keywords = { " or ", " and ", " hence", "so that", "however", " because" }; string[] sentences = SentenceTokenizer(richTextBox1.Text); string remSentence; foreach (string sentence in sentences) { remSentence = sentence; richTextBox3.Text = remSentence; for (int i =0; i < keywords.Length; i++) { if ((remSentence.Contains(keywords[i])))// || (remSentence.IndexOf(keywords[i]) > 0)) { richTextBox2.Text += remSentence.Substring(0, remSentence.IndexOf(keywords[i])) + '\n' + keywords[i] + '\n'; remSentence = remSentence.Substring(remSentence.IndexOf(keywords[i]) + keywords[i].Length); } } richTextBox2.Text += remSentence; } } public static string[] SentenceTokenizer(string text) { char[] sentdelimiters = new char[] { '.', '?', '۔', '؟', '\r', ':', '-' }; // '{ ',' }', '( ', ' )', ' [', ']', '>', '<','-', '_', '= ', '+','|', '\\', ':', ';', ' ', '\'', ',', '.', '/', '?', '~', '!','@', '#', '$', '%', '^', '&', '*', ' ', '\r', '\n', '\t'}; // text.Remove('\n'); return text.Split(sentdelimiters, StringSplitOptions.RemoveEmptyEntries); }

        B Offline
        B Offline
        Bernhard Hiller
        wrote on last edited by
        #3

        if ((remSentence.Contains(keywords[i]))) - that's where your trouble starts: you execute it only once, regardless of the number of occurrences. You'd better use a function to split the sentence, and apply that function recursively to the resulting sub-sentences. And yes: regular expressions are preferred.

        K 2 Replies Last reply
        0
        • B Bernhard Hiller

          if ((remSentence.Contains(keywords[i])))That's the position your trouble starts: you execute it once only regardless of the number of occurrences. You'll better use a function to split the sentence, and apply that function recursively on the resulting sub-sentences. And yes: Regular Expressions are preferred.

          K Offline
          K Offline
          KhanKtk
          wrote on last edited by
          #4

          here is what i did using regex. it works well. But doing this way, Splitting with regex, i lost the control over the word "and" for further processing. I have a lexicon of 20 words that normally appears before the "and (اور)" in urdu language. In next step I want to have a way to check the word before "and" against the lexicon and if found the sentence is broken else display the complete sentence. private void button1_Click(object sender, EventArgs e) { { richTextBox2.Text = ""; richTextBox3.Text = ""; string[] sentences = SentenceTokenizer(richTextBox1.Text); string remSentence; // these are urdu conjunctions. i am actually working on urdu language. Regex r = new Regex("(کہ |اور | تاکہ| مگر | تاہم | کیونکہ | لیکن )"); foreach (string sentence in sentences) { remSentence = sentence; remSentence = r.Replace(remSentence, "|"); string[] phrases = remSentence.Split('|'); for (int i = 0; i < phrases.Length; i++) { richTextBox2.Text += phrases[i] + '\n'; } } } }

          1 Reply Last reply
          0
          • B Bernhard Hiller

            if ((remSentence.Contains(keywords[i])))That's the position your trouble starts: you execute it once only regardless of the number of occurrences. You'll better use a function to split the sentence, and apply that function recursively on the resulting sub-sentences. And yes: Regular Expressions are preferred.

            K Offline
            K Offline
            KhanKtk
            wrote on last edited by
            #5

            Would you please share the modified code?

            B 1 Reply Last reply
            0
            • K KhanKtk

              would you plz share some coded modification?

              B Offline
              B Offline
              Bernhard
              wrote on last edited by
              #6

              I am not Bernhard Hiller.


              All the label says is that this stuff contains chemicals "... known to the State of California to cause cancer in rats and low-income test subjects."
              Roger Wright
              http://www.codeproject.com/lounge.asp?select=965687&exp=5&fr=1#xx965687xx

              1 Reply Last reply
              0
              • Kornfeld Eliyahu PeterK Kornfeld Eliyahu Peter

                I think you have to learn some regex, it can help you out... http://www.regular-expressions.info/[^] http://regex.learncodethehardway.org/[^]

                I'm not questioning your powers of observation; I'm merely remarking upon the paradox of asking a masked man who he is. (V)

                K Offline
                K Offline
                KhanKtk
                wrote on last edited by
                #7

                here is what i did using regex. it works well. But doing this way, Splitting with regex, i lost the control over the word "and" for further processing. I have a lexicon of 20 words that normally appears before the "and (اور)" in urdu language. In next step I want to have a way to check the word before "and" against the lexicon and if found the sentence is broken else display the complete sentence. private void button1_Click(object sender, EventArgs e) { { richTextBox2.Text = ""; richTextBox3.Text = ""; string[] sentences = SentenceTokenizer(richTextBox1.Text); string remSentence; // these are urdu conjunctions. i am actually working on urdu language. Regex r = new Regex("(کہ |اور | تاکہ| مگر | تاہم | کیونکہ | لیکن )"); foreach (string sentence in sentences) { remSentence = sentence; remSentence = r.Replace(remSentence, "|"); string[] phrases = remSentence.Split('|'); for (int i = 0; i < phrases.Length; i++) { richTextBox2.Text += phrases[i] + '\n'; } } } }

                1 Reply Last reply
                0
                • K KhanKtk

                  Here is what i did so far. The problem is if a conjunction appears twice in the sentence the code doesnt work for the 2nd appearance of the conjunction. plz if any expert can help ? private void SplitSentence_Click(object sender, EventArgs e) { richTextBox2.Text = ""; richTextBox3.Text = ""; string[] keywords = { " or ", " and ", " hence", "so that", "however", " because" }; string[] sentences = SentenceTokenizer(richTextBox1.Text); string remSentence; foreach (string sentence in sentences) { remSentence = sentence; richTextBox3.Text = remSentence; for (int i =0; i < keywords.Length; i++) { if ((remSentence.Contains(keywords[i])))// || (remSentence.IndexOf(keywords[i]) > 0)) { richTextBox2.Text += remSentence.Substring(0, remSentence.IndexOf(keywords[i])) + '\n' + keywords[i] + '\n'; remSentence = remSentence.Substring(remSentence.IndexOf(keywords[i]) + keywords[i].Length); } } richTextBox2.Text += remSentence; } } public static string[] SentenceTokenizer(string text) { char[] sentdelimiters = new char[] { '.', '?', '۔', '؟', '\r', ':', '-' }; // '{ ',' }', '( ', ' )', ' [', ']', '>', '<','-', '_', '= ', '+','|', '\\', ':', ';', ' ', '\'', ',', '.', '/', '?', '~', '!','@', '#', '$', '%', '^', '&', '*', ' ', '\r', '\n', '\t'}; // text.Remove('\n'); return text.Split(sentdelimiters, StringSplitOptions.RemoveEmptyEntries); }

                  L Offline
                  L Offline
                  Lost User
                  wrote on last edited by
                  #8

                  Something like the code below will loop for each keyword until there are no more matches left for that keyword, but it won't be perfect, because it will skip over other keywords while it's looking. E.g., for "if you had this and that or the other and something", you'd strip out all the "and"s before stripping out the "or". What you need to do is: repeat (find the first occurrence of ANY of the keywords in your sentence, then split the sentence there) until no occurrences are found. Using the IndexOf method, you can loop through your keywords, finding the lowest non-negative value of IndexOf (it returns -1 when there is no match) and storing that word. When the loop finishes, split the sentence using that word.

                  // Replacement for the single `if` inside the keyword loop: keep splitting
                  // remSentence on keywords[i] until it no longer occurs, so that repeated
                  // occurrences of the same conjunction are all handled.
                  // Limitation: keywords are still processed one at a time, so output order
                  // follows the keyword list rather than the order of appearance in the sentence.
                  bool finished = false;
                  while (!finished)
                  {
                      // Search once and reuse the index. The comparison is ordinal on purpose:
                      // string.Contains(string) is ordinal but string.IndexOf(string) is
                      // culture-sensitive, so mixing them can make the "found" test and the
                      // split position disagree on non-ASCII text.
                      int keywordIndex = remSentence.IndexOf(keywords[i], System.StringComparison.Ordinal);
                      if (keywordIndex >= 0)
                      {
                          // Emit the text before the keyword, then the keyword itself, each on its own line.
                          richTextBox2.Text += remSentence.Substring(0, keywordIndex) + '\n' + keywords[i] + '\n';

                          // Continue with whatever follows this occurrence of the keyword.
                          remSentence = remSentence.Substring(keywordIndex + keywords[i].Length);
                      }
                      else
                      {
                          finished = true;
                      }
                  }

                  1 Reply Last reply
                  0
                  Reply
                  • Reply as topic
                  Log in to reply
                  • Oldest to Newest
                  • Newest to Oldest
                  • Most Votes


                  • Login

                  • Don't have an account? Register

                  • Login or register to search.
                  • First post
                    Last post
                  0
                  • Categories
                  • Recent
                  • Tags
                  • Popular
                  • World
                  • Users
                  • Groups