gen eval dataset

2026-03-07 14:24:44 +00:00 · 2023-09-15 16:45:46 -05:00
parent f5c68cb0b6
commit eceaaf52ce
4 changed files with 1132 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .idea
+.ipynb_checkpoints
 .DS_Store
--- a/anima_100k/extened_longchat_topiced_conversations.json
+++ b/anima_100k/extened_longchat_topiced_conversations.json
--- a/anima_100k/gen_longchat_lines_retrieval_eval_dataset.ipynb
+++ b/anima_100k/gen_longchat_lines_retrieval_eval_dataset.ipynb
@@ -0,0 +1,314 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6d8683b7-0fab-4937-b7ad-72d70a0260ac",
+   "metadata": {},
+   "source": [
+    "# tokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d33c6c4e-9fd7-4850-8800-12ac35a867a0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: wonderwords in /usr/local/anaconda3/envs/ghostaienv/lib/python3.8/site-packages (2.2.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install wonderwords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "591305e5-0459-4f0b-9968-f77d207d0172",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os, json\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3cca5b71-75c5-44bc-9e80-330c93915f3d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import LlamaTokenizer\n",
+    "import torch\n",
+    "\n",
+    "# Load only the tokenizer for this checkpoint; the generation cell below uses\n",
+    "# tokenizer.encode(...) to record each prompt's length in tokens.\n",
+    "base_model = \"huggyllama/llama-13b\"\n",
+    "tokenizer = LlamaTokenizer.from_pretrained(base_model,\n",
+    "                                          )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "041ff3ce-d593-4f7d-be0e-c5488aeb9156",
+   "metadata": {},
+   "source": [
+    "# imports and random seed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "97a48cc7-7c41-4e7d-96ca-4771472a3e81",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import random\n",
+    "\n",
+    "# Seed both generators: the line-generation cell draws its numbers from the\n",
+    "# stdlib `random` module (random.randint), which np.random.seed does not affect.\n",
+    "np.random.seed(42)\n",
+    "random.seed(42)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1409125a-2030-4067-9a33-8612c4cd668b",
+   "metadata": {},
+   "source": [
+    "# gen lines eval dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4bf7c52c-ce66-483e-9a6a-c6067d1dbdeb",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "def generate_line_index(num_line, idx_opt):\n",
+    "    \"\"\"Return a list of `num_line` line-index labels in the style named by `idx_opt`.\n",
+    "\n",
+    "    Supported styles:\n",
+    "      * 'LRT-ABCindex': strings over the letters A-F, at least 6 characters long,\n",
+    "        in itertools.product order ('AAAAAA', 'AAAAAB', ...).\n",
+    "      * 'LRT-UUID': random UUID4 strings.\n",
+    "      * 'LRT-NL': two random wonderwords words joined by a hyphen.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if `idx_opt` is not one of the styles above.\n",
+    "    \"\"\"\n",
+    "    if idx_opt == \"LRT-ABCindex\":\n",
+    "        # Imported locally: no cell of this notebook imports itertools.\n",
+    "        import itertools\n",
+    "\n",
+    "        ingredients = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n",
+    "\n",
+    "        # Smallest label length (never below 6) that offers enough combinations.\n",
+    "        length = 6\n",
+    "        while len(ingredients) ** length < num_line:\n",
+    "            length += 1\n",
+    "\n",
+    "        # islice stops after num_line labels instead of building every product first.\n",
+    "        first_labels = itertools.islice(\n",
+    "            itertools.product(ingredients, repeat=length), num_line\n",
+    "        )\n",
+    "        return [\"\".join(label) for label in first_labels]\n",
+    "    elif idx_opt == \"LRT-UUID\":\n",
+    "        # Imported locally: no cell of this notebook imports uuid.\n",
+    "        import uuid\n",
+    "\n",
+    "        return [str(uuid.uuid4()) for _ in range(num_line)]\n",
+    "    elif idx_opt == \"LRT-NL\":\n",
+    "        import wonderwords\n",
+    "\n",
+    "        w = wonderwords.RandomWord()\n",
+    "        # NOTE(review): both halves are drawn from the 'noun' category, so labels are\n",
+    "        # noun-noun pairs despite the `adjs` name -- confirm whether that is intended.\n",
+    "        adjs = w.random_words(num_line, include_categories=[\"noun\"])\n",
+    "        nouns = w.random_words(num_line, include_categories=[\"noun\"])\n",
+    "\n",
+    "        return [f\"{adj}-{noun}\" for adj, noun in zip(adjs, nouns)]\n",
+    "\n",
+    "    # Fail loudly instead of implicitly returning None for an unknown style.\n",
+    "    raise ValueError(f\"Unknown line index option: {idx_opt!r}\")\n",
+    "    \n",
+    "def retrieve_expected(lines, random_line_pos):\n",
+    "    \"\"\"Return `(expected_number, correct_line)` for the line at `random_line_pos`.\n",
+    "\n",
+    "    `expected_number` is the integer inside the first `<digits>` group found in\n",
+    "    the line, or None (after printing a warning) when the line has no such group.\n",
+    "    \"\"\"\n",
+    "    # Imported locally so this cell does not rely on a later cell's `import re`.\n",
+    "    import re\n",
+    "\n",
+    "    correct_line = lines[random_line_pos]\n",
+    "    # Raw string: a bare backslash-d in a normal string literal is an invalid escape.\n",
+    "    match = re.search(r\"<\\d+>\", correct_line)\n",
+    "    if match is not None:\n",
+    "        # Strip the surrounding angle brackets before converting.\n",
+    "        expected_number = int(match.group()[1:-1])\n",
+    "    else:\n",
+    "        expected_number = None\n",
+    "        print(f\"Got unparsable line: {correct_line}\")\n",
+    "\n",
+    "    return expected_number, correct_line\n",
+    "\n",
+    "def generate_prompt_from_lines(lines):\n",
+    "    \"\"\"Concatenate the strings in `lines`, in order, into a single prompt string.\"\"\"\n",
+    "    return \"\".join(lines)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "bbfe0587-b2ad-4334-88a7-5f8a62b17f30",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                                                                                                                                                                                | 0/20 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (92263 > 2048). Running this sequence through the model will result in indexing errors\n",
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:09<00:00,  2.18it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "3687.8080000000004"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import random, re\n",
+    "\n",
+    "RECORD_COUNT = 20\n",
+    "\n",
+    "ROWS = [4000]\n",
+    "output_dir = \".\"\n",
+    "\n",
+    "# Instruction text placed before the generated lines; identical for every record.\n",
+    "prompt_header = (\"Below is a record of lines I want you to remember. \"\n",
+    "                 \"Each line begins with 'line <line index>' and contains \"\n",
+    "                 \"a '<REGISTER_CONTENT>' at the end of the line as a numerical value. \"\n",
+    "                 \"For each line index, memorize its corresponding <REGISTER_CONTENT>. At \"\n",
+    "                 \"the end of the record, I will ask you to retrieve the corresponding \"\n",
+    "                 \"<REGISTER_CONTENT> of a certain line index. Now the record start:\\n\\n\")\n",
+    "\n",
+    "# Sum of prompt token counts over all records; turned into a mean after the loops.\n",
+    "total_prompt_len = 0\n",
+    "\n",
+    "for n in ROWS:\n",
+    "    output_path = os.path.join(output_dir, f\"{n}_lines_en.jsonl\")\n",
+    "\n",
+    "    # The context manager closes the file even if building a record raises.\n",
+    "    with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
+    "        for i in tqdm(list(range(RECORD_COUNT))):\n",
+    "            lines = []\n",
+    "\n",
+    "            line_idx_opt = \"LRT-NL\"\n",
+    "\n",
+    "            if line_idx_opt == \"LRT\":\n",
+    "                # Plain numeric labels 1..n.\n",
+    "                line_idxes = list(range(1, n + 1))\n",
+    "                lines.extend([f\"line {i}: REGISTER_CONTENT is <{random.randint(1, 50000)}>\\n\" for i in line_idxes])\n",
+    "                random_idx = random.randint(1, n)\n",
+    "                random_num = random_idx - 1\n",
+    "            else:\n",
+    "                line_idxes = generate_line_index(n, line_idx_opt)\n",
+    "                lines.extend([f\"line {i}: REGISTER_CONTENT is <{random.randint(1, 50000)}>\\n\" for i in line_idxes])\n",
+    "                random_num = random.randint(0, len(line_idxes)-1)\n",
+    "                random_idx = line_idxes[random_num]\n",
+    "\n",
+    "            # random_num is the 0-based position of the target line; random_idx is its label.\n",
+    "            expected_number, correct_line = retrieve_expected(lines, random_num)\n",
+    "            lines.insert(0, f\"{prompt_header}\")\n",
+    "            prompt_postfix = f\"\\nNow the record is over. Tell me what is the <REGISTER_CONTENT> in line {random_idx}? I need the number.\"\n",
+    "\n",
+    "            prompt = generate_prompt_from_lines(lines)\n",
+    "\n",
+    "            # Token count of header + record; prompt_postfix is stored separately and not counted.\n",
+    "            prompt_len = len(tokenizer.encode(prompt))\n",
+    "            total_prompt_len += prompt_len\n",
+    "\n",
+    "            output = {\n",
+    "                \"random_idx\": (random_idx, random_num), # this is the line to retrieve\n",
+    "                \"expected_number\": expected_number,\n",
+    "                \"num_lines\": n,\n",
+    "                \"prompt_len\":prompt_len,\n",
+    "                \"correct_line\": correct_line,\n",
+    "                \"prompt_postfix\": prompt_postfix,\n",
+    "                \"prompt\": prompt}\n",
+    "\n",
+    "            json.dump(output, f, ensure_ascii=False)\n",
+    "            f.write(\"\\n\")\n",
+    "\n",
+    "# Mean prompt length in tokens. The divisor is the number of records actually\n",
+    "# written (previously a hard-coded 500, which under-reported the mean).\n",
+    "record_total = RECORD_COUNT * len(ROWS)\n",
+    "avg_len = total_prompt_len / record_total if record_total else 0\n",
+    "avg_len"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3b3d86e-6da6-44f2-887a-ae1374961fa0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!head -n 1 {n}_lines_en.jsonl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "cf86aebf-b78d-43bf-8aea-d9ced7676855",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "      20 4000_lines_en.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wc -l {n}_lines_en.jsonl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2dbb8bc4-8449-43d3-b32b-fe0072a815e7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/anima_100k/gen_longchat_topics_retrieval_eval_dataset_extended.ipynb
+++ b/anima_100k/gen_longchat_topics_retrieval_eval_dataset_extended.ipynb
@@ -0,0 +1,816 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "76a17e30",
+   "metadata": {},
+   "source": [
+    "# tokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "cd7c1605",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "from transformers import LlamaTokenizer\n",
+    "\n",
+    "\n",
+    "# Load only the tokenizer for this checkpoint.\n",
+    "# NOTE(review): presumably used further down to measure prompt length in tokens -- confirm.\n",
+    "base_model = \"huggyllama/llama-13b\"\n",
+    "tokenizer = LlamaTokenizer.from_pretrained(base_model,\n",
+    "                                           \n",
+    "                                          )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6bacd3e6",
+   "metadata": {},
+   "source": [
+    "# loop convs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "49be06a1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "# Load the list of topic conversations; other cells read each entry's\n",
+    "# 'TOPIC' and 'CONVERSATION' keys.\n",
+    "with open('extened_longchat_topiced_conversations.json', 'r', encoding='utf-8') as f:\n",
+    "    conv_list = json.load(f)\n",
+    "    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9a7c08a2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['The psychology of happiness',\n",
+       " 'The benefits of mindfulness meditation',\n",
+       " 'The effects of climate change on ocean ecosystems',\n",
+       " 'The future of sustainable agriculture',\n",
+       " 'The history and culture of ancient civilizations',\n",
+       " 'The impact of social media on communication',\n",
+       " 'The role of education in society',\n",
+       " 'The benefits of regular exercise',\n",
+       " 'The impact of technology on human connection',\n",
+       " 'The future of renewable energy technology',\n",
+       " 'The psychology of creativity',\n",
+       " 'The impact of social media on mental health in adults',\n",
+       " 'The benefits of reading for pleasure',\n",
+       " 'The effects of stress on the body and mind',\n",
+       " 'The history and impact of the Renaissance',\n",
+       " 'The role of art in society',\n",
+       " 'The benefits of a plant-based diet',\n",
+       " 'The impact of social media on body image',\n",
+       " 'The future of space tourism',\n",
+       " 'The effects of sleep on overall health',\n",
+       " 'The role of music in society',\n",
+       " 'The benefits of volunteering',\n",
+       " 'The impact of technology on privacy and security',\n",
+       " 'The future of renewable energy storage',\n",
+       " 'The psychology of addiction and recovery',\n",
+       " 'The benefits of learning a new language',\n",
+       " 'The effects of air pollution on human health',\n",
+       " 'The history and culture of the Middle Ages',\n",
+       " 'The role of sports in society',\n",
+       " 'The benefits of spending time in nature',\n",
+       " 'The relationship between personality and career success',\n",
+       " 'How to foster creativity and innovation in the workplace',\n",
+       " 'The pros and cons of raising the minimum wage',\n",
+       " 'The impact of automation and AI on the job market',\n",
+       " 'Strategies for achieving work-life balance',\n",
+       " 'The benefits and risks of social media for business',\n",
+       " 'The keys to effective leadership and management',\n",
+       " 'How to optimize remote and hybrid work environments',\n",
+       " 'The future of space exploration and colonization',\n",
+       " 'The ethics of genetic engineering and human enhancement',\n",
+       " 'How social media has transformed human communication',\n",
+       " 'Strategies individuals can use to protect privacy and security online',\n",
+       " 'How virtual reality could transform education and training',\n",
+       " 'The potential benefits and risks of 3D printing technology',\n",
+       " 'How developing countries can leapfrog in adopting new technologies',\n",
+       " 'The promise and limitations of renewable energy',\n",
+       " 'How antibiotics resistance develops and spreads',\n",
+       " 'The potential benefits and risks of legalizing recreational drugs',\n",
+       " 'How travel can enrich our lives and boost wellbeing',\n",
+       " 'The promise and challenges of CRISPR gene-editing technology',\n",
+       " 'The impact of loneliness and social isolation on health',\n",
+       " 'How to inspire more students to pursue STEM careers',\n",
+       " 'The keys to making positive lifestyle changes that last',\n",
+       " 'How birth order affects personality and development',\n",
+       " 'How mixed-income housing can benefit communities',\n",
+       " 'The potential environmental benefits of vertical farming',\n",
+       " 'The psychological impact of natural disasters and climate events',\n",
+       " 'The pros and cons of space tourism',\n",
+       " \"How developing countries can improve women's access to education\",\n",
+       " 'The effects of gerrymandering on democratic representation',\n",
+       " 'How we can solve developing world hunger using technology',\n",
+       " 'The positive and negative impacts of desalination technology',\n",
+       " 'The potential benefits and downsides of cashless societies',\n",
+       " 'The pros and cons of trigger warnings in classrooms',\n",
+       " 'The keys to mitigating age-related memory loss',\n",
+       " 'How architecture and design impact our emotions and behavior',\n",
+       " 'The promise and limitations of CRISPR gene editing technology',\n",
+       " 'The impact of increasing life expectancy on societies',\n",
+       " 'How healthy gut bacteria influence mood and wellbeing',\n",
+       " 'The potential benefits and risks of lab-grown meat',\n",
+       " 'The pros and cons of trigger warnings in classrooms',\n",
+       " 'The keys to mitigating age-related memory loss',\n",
+       " 'How architecture and design impact our emotions and behavior',\n",
+       " 'The promise and limitations of CRISPR gene editing technology',\n",
+       " 'The impact of increasing life expectancy on societies',\n",
+       " 'How healthy gut bacteria influence mood and wellbeing',\n",
+       " 'The relationship between income, wealth and happiness',\n",
+       " 'The keys to maintaining cognitive health and preventing dementia',\n",
+       " 'How we can reform the juvenile justice system',\n",
+       " 'The role of the microbiome in human health and disease',\n",
+       " 'How urbanization is transforming communities',\n",
+       " 'The emotional and mental health impacts of natural disasters',\n",
+       " 'The ethical issues surrounding genome editing of embryos',\n",
+       " 'How social media has transformed human communication',\n",
+       " 'The impact of climate change on vulnerable populations',\n",
+       " 'The relationship between diet, exercise and mental health',\n",
+       " 'Howanguages shape the way we think and perceive the world',\n",
+       " 'The role of public art in building community and connection',\n",
+       " 'The pros and cons of legalizing marijuana',\n",
+       " 'How to raise independent, responsible children',\n",
+       " 'The keys to healthy romantic relationships',\n",
+       " 'The biological and environmental causes of addiction',\n",
+       " 'The role of pop culture and social media in shaping beauty ideals',\n",
+       " 'How society should respond to the aging population',\n",
+       " 'The most promising renewable and clean energy technologies',\n",
+       " 'The benefits and risks of homeschooling',\n",
+       " 'How universal healthcare could transform society',\n",
+       " 'The positive and negative impacts of globalization',\n",
+       " 'How to foster diversity, equity and inclusion in the workplace',\n",
+       " 'The potential benefits and risks of nanotechnology',\n",
+       " 'The impact of music, film and literature in reflecting society',\n",
+       " 'Why sleep is so critical for physical and mental health',\n",
+       " 'How we can improve end-of-life care for the elderly and terminally ill',\n",
+       " 'The relationship between income inequality and crime',\n",
+       " 'The impact of standardized testing on education',\n",
+       " 'How we can solve the student debt crisis',\n",
+       " 'The pros and cons of year-round schooling',\n",
+       " 'Why nutrition should be more emphasized in schools',\n",
+       " 'How we can close the gender pay gap',\n",
+       " 'How to reduce unconscious bias in the workplace',\n",
+       " 'The keys to successful parenting in the digital age',\n",
+       " 'The health benefits and risks of popular diets',\n",
+       " 'The ethics of factory farming and animal product consumption',\n",
+       " 'How diseases spread and pandemics emerge',\n",
+       " 'Privacy issues related to DNA sequencing and genetic testing',\n",
+       " 'The potential benefits and risks of human enhancement technologies',\n",
+       " 'The impact of space exploration on technological innovation',\n",
+       " 'The potential health benefits of medicinal marijuana',\n",
+       " 'The role of public transportation in building sustainable cities',\n",
+       " 'How developing countries can manage waste and pollution',\n",
+       " 'The ethics of human cloning and genetic engineering',\n",
+       " 'The keys to successful aging and longevity',\n",
+       " 'How we can get closer to achieving gender equality',\n",
+       " 'How veganism and plant-based diets are impacting the food industry',\n",
+       " 'The keys to managing stress in the modern world',\n",
+       " 'How we can make cities more livable and sustainable',\n",
+       " 'The potential benefits and risks of nanobots in medicine',\n",
+       " 'How quantum computing could transform technology and society',\n",
+       " 'The impact of automation on developing countries',\n",
+       " 'The keys to managing anxiety in children and teens',\n",
+       " 'How we can improve access to mental health resources',\n",
+       " 'The role of documentary films in shaping public discourse',\n",
+       " 'The effects of space travel on the human body and mind',\n",
+       " 'How we can address the loneliness epidemic',\n",
+       " 'The pros and cons of standardized educational curricula',\n",
+       " 'How developing nations can improve infrastructure',\n",
+       " 'The psychology behind common human biases and blindspots',\n",
+       " 'How music therapy is used to treat various conditions',\n",
+       " 'What leads to peak performance in sports and other domains',\n",
+       " 'How we can address ethical issues related to AI and automation',\n",
+       " 'The pros and cons of growing up as an only child',\n",
+       " 'How the gut microbiome influences the brain and behavior',\n",
+       " 'The impact of increasing smartphone and social media usage',\n",
+       " 'How we can inspire more students to pursue STEM careers',\n",
+       " 'The potential benefits and risks of legalizing marijuana',\n",
+       " 'What leads to optimal flow states and peak performance',\n",
+       " 'How light and noise pollution impact health and behavior',\n",
+       " 'The pros and cons of democracy versus authoritarianism',\n",
+       " 'How music and rhythm can enhance brain function and healing',\n",
+       " 'Strategies for maintaining work-life balance in demanding careers',\n",
+       " 'The environmental impact of meat consumption and animal agriculture',\n",
+       " 'The potential uses and misuses of persuasion psychology in business',\n",
+       " 'How diseases emerge and spread between humans and animals',\n",
+       " 'The psychological impact of colors, shapes and textures',\n",
+       " 'The pros and cons of multitasking and distraction while working',\n",
+       " 'The keys to mitigating age-related cognitive decline',\n",
+       " 'How biohacking could transform human health and performance',\n",
+       " 'The potential benefits and risks of legalizing some drugs']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "[x['TOPIC'] for x in conv_list]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fe6b60a3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'TOPIC_ID': 1,\n",
+       " 'TOPIC': 'The psychology of happiness',\n",
+       " 'CONVERSATION': \"USER: I would like to discuss the topic of the psychology of happiness. \\n ASSISTANT: Sure, I'd be happy to talk about that with you! What specifically would you like to know? \\n USER: I'm interested in learning more about the factors that contribute to happiness. What are some things that people can do to increase their overall level of happiness? \\n ASSISTANT: That's a great question! Some of the key factors that contribute to happiness include positive social relationships, engaging in meaningful activities, and finding a sense of purpose or fulfillment in life. Additionally, factors like good physical health, financial stability, and a sense of autonomy can also play a role in our overall happiness. \\n USER: That's really interesting. What about the role of gratitude and mindfulness in happiness? \\n ASSISTANT: Gratitude and mindfulness are both important factors that have been linked to increased levels of happiness. Studies have shown that practicing gratitude, such as by keeping a gratitude journal or expressing appreciation to others, can increase positive emotions and decrease negative emotions. Similarly, mindfulness, which involves being fully present and non-judgmental in the moment, has been shown to have a positive impact on our overall well-being. \\n USER: That makes a lot of sense. I'm also curious about whether there are any differences in how happiness is experienced across different cultures. \\n ASSISTANT: Yes, cultural factors can play a role in how happiness is experienced and expressed. For example, some cultures may place greater emphasis on social relationships or community, while others may prioritize individual achievement or personal fulfillment. Additionally, the factors that contribute to happiness may vary depending on the cultural context, such as the role of spirituality or religion. \\n USER: That's really fascinating. 
Overall, it sounds like there are many different factors that can influence our happiness. \\n ASSISTANT: Yes, that's definitely true. While there is no one-size-fits-all approach to happiness, there are many strategies that people can use to increase their overall sense of well-being and fulfillment. By focusing on positive social relationships, engaging in meaningful activities, and cultivating a sense of gratitude and mindfulness, we can all work to increase our own levels of happiness. \\n USER: Great, this is the end of our discussion on the topic The psychology of happiness, let's talk about the next topic.\",\n",
+       " 'SOURCE': 'longchat'}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "conv_list[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d05fb360",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee15b521",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "96516d1b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "62b7fae4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "158"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(conv_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3d6bf40",
+   "metadata": {},
+   "source": [
+    "# gen topic eval dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "30fbcc9c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import random\n",
+    "\n",
+    "np.random.seed(42) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "28594163-ecc3-4759-aff8-292d65557c1e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "class Prompt:\n",
+    "    \"\"\"A topic-retrieval test prompt assembled from several topic conversations.\"\"\"\n",
+    "\n",
+    "    def __init__(self, id):\n",
+    "        self.id = id\n",
+    "        self.conv_list = []   # conversation dicts, in the order they were added\n",
+    "        self.topic_list = []  # the 'TOPIC' value of each added conversation\n",
+    "\n",
+    "    def add_conv(self, conv):\n",
+    "        \"\"\"Append `conv` to the record and remember its 'TOPIC'.\"\"\"\n",
+    "        self.conv_list.append(conv)\n",
+    "        self.topic_list.append(conv['TOPIC'])\n",
+    "\n",
+    "    def assemble_prompt(self):\n",
+    "        \"\"\"Build the prompt; return `(prompt, prompt_postfix, answer_index, answer_topic)`.\n",
+    "\n",
+    "        The question always targets the first topic, so `answer_index` is 0 and\n",
+    "        `answer_topic` is `topic_list[0]`. Also stores `retrieval_id`, `prompt`\n",
+    "        and `prompt_postfix` on the instance.\n",
+    "        \"\"\"\n",
+    "        # 1-based position of the topic to retrieve (always the first one).\n",
+    "        self.retrieval_id = 1\n",
+    "        answer_index = self.retrieval_id - 1\n",
+    "\n",
+    "        instructions = \"\".join([\n",
+    "            \"Below between '[[[' and ']]]' is a record of the previous conversations \",\n",
+    "            f\"on {len(self.topic_list)} different topics between the ASSISTANT and \",\n",
+    "            \"the USER. At the beginning of each topic, the USER will say \",\n",
+    "            \"'I would like to discuss the topic of <TOPIC>'. Memorize each \",\n",
+    "            \"<TOPIC>. At the end of the record, I will ask you to retrieve the \",\n",
+    "            \"first topic. Now the record start. \\nRECORD:\\n[[[\",\n",
+    "        ])\n",
+    "        # Conversations are appended back to back, with no extra separator.\n",
+    "        record = \"\".join(entry['CONVERSATION'] for entry in self.conv_list)\n",
+    "        self.prompt = instructions + record\n",
+    "\n",
+    "        self.prompt_postfix = (\n",
+    "            \"]]]\\nNow \"\n",
+    "            \"the record ends. What is the first topic(s) in the record? Only give \"\n",
+    "            \"me the topic name. Do not summarize yourself.\\nAnswer:\"\n",
+    "        )\n",
+    "\n",
+    "        return self.prompt, self.prompt_postfix, answer_index, self.topic_list[answer_index]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "8d8fb185",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 6493.74it/s]\n",
+      "1it [00:00,  2.06it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 86843\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2it [00:00,  2.15it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 86941\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "3it [00:01,  2.21it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87157\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "4it [00:01,  2.26it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87181\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "5it [00:02,  2.31it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87464\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "6it [00:02,  2.32it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87233\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "7it [00:03,  2.29it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87847\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "8it [00:03,  2.26it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87687\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "9it [00:03,  2.28it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87117\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "10it [00:04,  2.26it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87501\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11it [00:04,  2.24it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 86544\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "12it [00:05,  2.24it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87139\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "13it [00:05,  2.24it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87159\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "14it [00:06,  2.23it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87962\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "15it [00:06,  2.26it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 86984\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "16it [00:07,  2.20it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87240\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "17it [00:07,  2.20it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87324\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "18it [00:08,  2.20it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 86328\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "19it [00:08,  2.22it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 87841\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "20it [00:08,  2.24it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "wrote prompt_length: 86598\n",
+      "saved ../130_topics_extended_cnt20.jsonl\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "87204.49999999997"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from tqdm import tqdm\n",
+    "import os\n",
+    "\n",
+    "# Generate the retrieval-eval dataset: for every topic count in ROWS, build\n",
+    "# num_test_samples prompts and write them to one JSONL file in output_dir.\n",
+    "# Relies on Prompt, conv_list, tokenizer and np from earlier cells.\n",
+    "output_dir = \"../\"\n",
+    "num_test_samples = 20\n",
+    "\n",
+    "# Number of conversations packed into each prompt; one output file per entry.\n",
+    "ROWS = [130]\n",
+    "for num_topics in ROWS:\n",
+    "\n",
+    "    prompt_list = []\n",
+    "    prompt_len_list = []\n",
+    "\n",
+    "    for i in tqdm(range(num_test_samples)):\n",
+    "        prompt = Prompt(i)\n",
+    "        # Draw conversation indices; sampling falls back to with-replacement only\n",
+    "        # when conv_list holds fewer conversations than num_topics.\n",
+    "        indices = np.random.choice(len(conv_list), size=num_topics, replace=len(conv_list) < num_topics)\n",
+    "\n",
+    "        for idx in indices:\n",
+    "            prompt.add_conv(conv_list[idx])\n",
+    "\n",
+    "        prompt_list.append(prompt)\n",
+    "\n",
+    "    # write to output file, one JSON object per line\n",
+    "    output_path = os.path.join(output_dir, f\"{num_topics}_topics_extended_cnt{num_test_samples}.jsonl\")\n",
+    "    with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
+    "        # Wrap the list itself (not enumerate) so tqdm knows the total and can\n",
+    "        # render a real progress bar instead of a bare iteration counter.\n",
+    "        for p in tqdm(prompt_list):\n",
+    "            pt, prompt_postfix, target_id, target_topic = p.assemble_prompt()\n",
+    "\n",
+    "            # Prompt length measured as the number of ids returned by tokenizer.encode.\n",
+    "            prompt_len = len(tokenizer.encode(pt))\n",
+    "            prompt_len_list.append(prompt_len)\n",
+    "\n",
+    "            curr_output = {\"test_id\": p.id,\n",
+    "                           \"prompt\": pt,\n",
+    "                           \"prompt_postfix\": prompt_postfix,\n",
+    "                           \"target_id\": target_id,\n",
+    "                           \"target_topic\": target_topic,\n",
+    "                           \"topics\": p.topic_list,\n",
+    "                           \"prompt_length\": prompt_len}\n",
+    "            json.dump(curr_output, f, ensure_ascii=False)\n",
+    "            f.write(\"\\n\")\n",
+    "\n",
+    "            print(f\"wrote prompt_length: {prompt_len}\")\n",
+    "\n",
+    "    print(f\"saved {output_path}\")\n",
+    "\n",
+    "    # Average with a single division over the integer total: adding\n",
+    "    # prompt_len/len(prompt_list) once per prompt accumulates float rounding error.\n",
+    "    # Stays 0 when no prompts were generated.\n",
+    "    avg_len = sum(prompt_len_list) / len(prompt_list) if prompt_list else 0\n",
+    "\n",
+    "avg_len"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f97d43b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "bc403572",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "      20 ../130_topics_extended_cnt20.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wc -l {output_path}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4920e218",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "!head -n 1 {output_path}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51a61990-5885-49bc-b630-ea2e9b20ad4f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}