Siddharth63
/

pubmedul2-tiny-nl6

+# %pip install sentencepiece
+# %pip install datasets
+import unicodedata
+import os
+import nltk
+from tqdm import tqdm
+import glob
+from random import sample
def sample_and_make_tempfile(sentences_dir, num_files):
    """Sample sentence files and merge them into one temporary text file.

    Picks ``num_files`` of the ``*.txt`` files found directly inside
    ``sentences_dir``, reads all of their lines into memory and writes every
    non-empty line, stripped of surrounding whitespace, to ``text/temp.txt``
    (relative to the current working directory).

    :param sentences_dir: Directory holding the ``*.txt`` files, one
        sentence per line.
    :param num_files: How many files to sample (without replacement).
    :return: Path of the file that was written.
    :raises ValueError: From ``random.sample`` when ``num_files`` is negative
        or larger than the number of matching files.
    """
    sentence_files = glob.glob(sentences_dir + "/*.txt")
    # sample num_files
    sampled_files = sample(sentence_files, num_files)
    print("sampled files:")
    print(sampled_files)

    # read all the lines from the sampled files into one list
    all_lines = []
    for filename in sampled_files:
        with open(filename, encoding="utf-8") as f:
            all_lines.extend(f.read().splitlines())
    print("number of lines sampled:", len(all_lines))

    # combine into a single file and save
    tempfile_path = os.path.join("text", "temp.txt")
    # the output directory is not guaranteed to exist when this is called
    # on its own
    os.makedirs(os.path.dirname(tempfile_path), exist_ok=True)
    with open(tempfile_path, "w", encoding="utf-8") as f:
        for sentence in tqdm(all_lines):
            # remove surrounding whitespace / stray newlines
            line = sentence.strip()
            # skip blank lines; the previous `sentence != []` check compared
            # a str with a list and therefore never filtered anything
            if line:
                f.write(line + "\n")
    print("Wrote to ", tempfile_path)
    return tempfile_path
def chunks(sentences, n, tot_len):
    """Yield the ``"text"`` field of consecutive slices of ``sentences``.

    Start offsets run over ``range(0, tot_len, n)``; each slice covers at
    most ``n`` items and is clipped to ``len(sentences)``.

    :param sentences: Object supporting ``len()`` and slicing, where a slice
        can be indexed with ``"text"`` (presumably a Hugging Face dataset
        split -- confirm against the caller).
    :param n: Number of items per chunk.
    :param tot_len: Exclusive upper bound for the start offsets.
    """
    for start in range(0, tot_len, n):
        stop = start + n
        available = len(sentences)
        # never slice past the end of the data
        if stop > available:
            stop = available
        batch = sentences[start:stop]
        yield batch["text"]
def make_sentence_files(dataset, chunksize=5600000, data_dir='text/sentences'):
    """Write the dataset's text out as sentence-per-line files.

    The dataset is consumed in chunks of ``chunksize`` records via
    ``chunks``; chunk ``k`` is written to ``<data_dir>/sent_<k>.txt``. Each
    record is stripped, split with a regex tokenizer, and every resulting
    piece is NFKC-normalized and written on its own line.

    :param dataset: Object accepted by ``chunks`` (supports ``len()`` and
        slicing into something with a ``"text"`` entry).
    :param chunksize: Number of dataset records per output file.
    :param data_dir: Directory for the output files; created if missing.
    """
    # make sure data dir exists
    os.makedirs(data_dir, exist_ok=True)
    # use simple regex for sentence tokenizing
    # NOTE(review): the first character class does not exclude the ASCII '.',
    # so for text that only uses ASCII periods the match runs greedily to the
    # last '.' of the record (no real splitting), and any text after the last
    # terminator is dropped -- confirm this is intended.
    sent_detector = nltk.RegexpTokenizer(u'[^　！？。]*[！？。.\n]')
    # loop over the chunks
    for chunk_ind, sentence_chunk in enumerate(chunks(dataset, chunksize, len(dataset))):
        # new file for each chunk
        filename = "sent_{}.txt".format(chunk_ind)
        filepath = os.path.join(data_dir, filename)
        print("writing to ", filepath)
        with open(filepath, "w", encoding="utf-8") as f:
            for sentence in tqdm(sentence_chunk):
                # remove surrounding whitespace / newlines
                line = sentence.strip()
                # tokenize into sentences
                sentences = sent_detector.tokenize(line)
                # Unicode-normalize (fullwidth forms, ideographic spaces, ...).
                # The result of normalize() used to be thrown away, so nothing
                # was normalized. It is applied after tokenizing because the
                # pattern above matches fullwidth punctuation that NFKC would
                # fold to ASCII.
                f.writelines(
                    unicodedata.normalize('NFKC', s) + '\n' for s in sentences
                )
def combine_files(output_file, *files):
    """
    Combines the contents of multiple text files into a single file.

    Input files are streamed line by line, so they never have to fit in
    memory. A newline is appended after a file only when its last line is
    not already newline-terminated; this keeps the files separated without
    inserting blank lines, so the returned count matches the output.

    :param output_file: Path to the output file (overwritten if it exists).
    :param files: Paths to the files to be combined, in the given order.
    :return: Total number of lines in the combined file.
    """
    total_lines = 0
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for path in files:
            last_line = ''
            with open(path, 'r', encoding='utf-8') as infile:
                for last_line in infile:
                    outfile.write(last_line)
                    total_lines += 1
            # terminate an unterminated final line so the next file starts
            # on a fresh line
            if last_line and not last_line.endswith('\n'):
                outfile.write('\n')
    return total_lines
# make sentence files from the Hugging Face dataset
import datasets  # was used below without ever being imported

dataset_bio = datasets.load_dataset("Siddharth63/biological_dataset")
make_sentence_files(dataset_bio["train"])

# Combine the first two sentence files into the tokenizer training corpus.
# glob returns paths in arbitrary order, so sort to make the selection
# reproducible.
# output_file_path was previously undefined; the trainer below reads
# text/final_file.txt, so that is where the combined corpus is written.
output_file_path = "text/final_file.txt"
files_to_combine = sorted(glob.glob("text/sentences/*.txt"))
files_to_combine = files_to_combine[:2]
total_lines = combine_files(output_file_path, *files_to_combine)

# Train the SentencePiece model on the combined corpus; the trainer is asked
# to use at most input_sentence_size (45 million) shuffled sentences.
import sentencepiece as spm
spm.SentencePieceTrainer.train(input=output_file_path, model_prefix='spiece', vocab_size=32000, character_coverage=1.0,
                                pad_id=0, unk_id=2, eos_id=1, bos_id=-1,
                                user_defined_symbols=['[NLU]', '[NLG]', '[S2S]'],
                                train_extremely_large_corpus=True,
                                num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True)