implement word list and image gen

2024-12-13 13:11:48 -06:00
commit 2f8f3415fc
11 changed files with 13012 additions and 0 deletions

.gitignore · vendored · Normal file · 3 additions

@@ -0,0 +1,3 @@
.idea
output
venv

data/banned_words.txt · Normal file · 1654 additions

File diff suppressed because it is too large

data/imageable.txt · Normal file · 4345 additions

File diff suppressed because it is too large

data/nounlist.csv · Normal file · 6801 additions

File diff suppressed because it is too large

requirements.txt · Normal file · 3 additions

@@ -0,0 +1,3 @@
openai~=1.57.3
requests
nltk

src/__init__.py · Normal file · 0 additions

src/db.py · Normal file · 44 additions

@@ -0,0 +1,44 @@
import sqlite3
from pathlib import Path


def create_db(db_path: Path):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    # Create the table if it doesn't exist
    c.execute("""
        CREATE TABLE IF NOT EXISTS images (
            id TEXT PRIMARY KEY,
            prompt TEXT,
            model TEXT,
            size TEXT,
            quality TEXT,
            filename TEXT
        )
    """)
    # Commit and close the connection
    conn.commit()
    conn.close()


def insert_into_db(db_path: Path, image_id: str, prompt: str, model: str = "dall-e-3", size: str = "1024x1024", quality: str = "standard"):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    # Insert the record; the id column is TEXT, so the UUID is passed as a string
    c.execute(
        "INSERT INTO images (id, prompt, model, size, quality, filename) VALUES (?, ?, ?, ?, ?, ?)",
        (
            image_id,
            prompt,
            model,
            size,
            quality,
            f"{image_id}.png",
        ),
    )
    # Commit and close the connection
    conn.commit()
    conn.close()
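
A minimal usage sketch for the two helpers above (not part of this commit; the db path and id are placeholder assumptions):

    import sqlite3
    from pathlib import Path
    from src.db import create_db, insert_into_db

    db = Path("./output/images.db")
    create_db(db)  # idempotent thanks to CREATE TABLE IF NOT EXISTS
    insert_into_db(db, image_id="hypothetical-uuid", prompt="cartoon dog in a hat")
    conn = sqlite3.connect(db)
    print(conn.execute("SELECT id, prompt, filename FROM images").fetchall())
    conn.close()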

src/image_gen.py · Normal file · 47 additions

@@ -0,0 +1,47 @@
from openai import OpenAI
from pathlib import Path
import uuid
import requests
from enum import Enum
from typing import Literal


class Styles(Enum):
    emoji = "emoji"
    pixel_art = "pixel art"
    svg = "svg"
    cartoon = "cartoon"


client = OpenAI()


def image_style(base_prompt: str, style: Styles) -> str:
    # Use .value so pixel_art renders as "pixel art", not "pixel_art"
    return f"create {style.value} style of {base_prompt}"


def icon_gen(prompt: str, quality: Literal["hd", "standard"], output: Path = Path("./output/")) -> str:
    # Make sure the output directory exists
    output_path = Path(output) / quality
    output_path.mkdir(parents=True, exist_ok=True)
    # Generate the image using the OpenAI client
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size="1024x1024",
        quality=quality,
        n=1,
    )
    # Extract the image URL
    image_url = response.data[0].url
    # Generate a UUID for the filename
    image_id = str(uuid.uuid4())
    image_filename = f"{image_id}.png"
    image_filepath = output_path / image_filename
    # Download the image
    image_response = requests.get(image_url)
    image_response.raise_for_status()
    with open(image_filepath, "wb") as f:
        f.write(image_response.content)
    print(image_id)
    # Return the id so callers can record the image (e.g. via db.insert_into_db)
    return image_id
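
db.py is not yet called from icon_gen in this commit; a sketch of one way the pieces could fit together, using only functions introduced here (the db path is an assumption, and icon_gen returning the id follows the version above):

    from pathlib import Path
    from src.db import create_db, insert_into_db
    from src.image_gen import image_style, icon_gen, Styles

    db = Path("./output/images.db")
    create_db(db)
    prompt = image_style("dog in a hat", style=Styles.emoji)
    image_id = icon_gen(prompt, quality="standard")
    insert_into_db(db, image_id, prompt, quality="standard")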

src/wordlist.py · Normal file · 95 additions

@@ -0,0 +1,95 @@
import csv
import os
import requests
import zipfile
from io import BytesIO
from pathlib import Path
import nltk
from nltk.corpus import wordnet

# Make sure the WordNet corpus is available before is_imageable() uses it
nltk.download("wordnet", quiet=True)


def get_noun_list(output_dir: Path = Path("../data")) -> list[str]:
    nounlist = output_dir / "nounlist.csv"
    if not nounlist.exists():
        # URL of the ZIP file
        url = "https://www.kaggle.com/api/v1/datasets/download/leite0407/list-of-nouns"
        # Create the data directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        # Download the ZIP file
        print("Downloading the ZIP file...")
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            print("Download complete. Extracting files...")
            with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
                zip_ref.extractall(output_dir)
            print(f"Files extracted to {output_dir}")
        else:
            raise Exception(f"Failed to download the file. Status code: {response.status_code}")
    with open(nounlist, mode='r', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file)
        return [row[0] for row in reader if row]  # Skip empty rows


def get_banned_wordlist(output_dir: Path) -> list[str]:
    banned_word_file = output_dir / "banned_words.txt"
    if not banned_word_file.exists():
        print("Downloading banned words...")
        # Sources of banned words
        sources = [
            "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en",
            "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
        ]
        banned_words = set()
        # Download and combine
        for url in sources:
            response = requests.get(url)
            if response.status_code == 200:
                banned_words.update(response.text.splitlines())
        # Save to banned_words.txt
        with open(banned_word_file, "w") as file:
            file.write("\n".join(sorted(banned_words)))
        print("Saved banned words")
    with open(banned_word_file, "r") as file:
        banned_words = file.read().splitlines()
    if not banned_words:
        raise Exception(f"No banned words loaded from {banned_word_file}")
    return banned_words


def filter_banned_words_list(noun_list: list[str], banned_words: list[str]) -> list[str]:
    filtered_list = set(noun_list) - set(banned_words)
    print(f"Removed {len(noun_list) - len(filtered_list)} words")
    return list(filtered_list)


def filter_nonimageable_words(nouns: list[str]) -> list[str]:
    return [noun for noun in nouns if is_imageable(noun)]


def is_imageable(noun: str) -> bool:
    # Get all the noun synsets for the given word
    synsets = wordnet.synsets(noun, pos=wordnet.NOUN)
    if not synsets:
        return False
    # The canonical synset for "physical_entity"
    physical_entity = wordnet.synset('physical_entity.n.01')
    for syn in synsets:
        # Traverse up the hypernym tree
        for ancestor in syn.closure(lambda s: s.hypernyms()):
            if ancestor == physical_entity:
                return True
    return False
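
The hypernym-closure test above can be sanity-checked interactively: in WordNet, "dog" climbs dog → canine → … → physical_entity, while an abstract noun like "idea" climbs to abstraction instead. A quick sketch (assumes the wordnet corpus is downloaded):

    from nltk.corpus import wordnet

    def reaches_physical_entity(word: str) -> bool:
        target = wordnet.synset("physical_entity.n.01")
        return any(target in syn.closure(lambda s: s.hypernyms())
                   for syn in wordnet.synsets(word, pos=wordnet.NOUN))

    print(reaches_physical_entity("dog"))   # True: concrete, kept as imageable
    print(reaches_physical_entity("idea"))  # False: abstraction, filtered out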

tests/image_gen_test.py · Normal file · 6 additions

@@ -0,0 +1,6 @@
from src.image_gen import image_style, Styles, icon_gen
from pathlib import Path

if __name__ == "__main__":
    dog_prompt = image_style("dog in a hat on a beach with a drink", style=Styles.cartoon)
    icon_gen(dog_prompt, output=Path("../output"), quality="hd")

tests/wordlist_test.py · Normal file · 16 additions

@@ -0,0 +1,16 @@
from src.wordlist import get_noun_list, get_banned_wordlist, filter_banned_words_list, filter_nonimageable_words
from pathlib import Path

if __name__ == "__main__":
    nounlist = get_noun_list(Path("../data"))
    banned_words = get_banned_wordlist(Path("../data"))
    print(len(nounlist))
    print(len(banned_words))
    filtered_words = filter_banned_words_list(noun_list=nounlist, banned_words=banned_words)
    print(len(filtered_words))
    imageable = filter_nonimageable_words(filtered_words)
    print(len(imageable))
    print(f"removed as non-imageable: {len(filtered_words) - len(imageable)}")
    with open("../data/imageable.txt", "w") as fp:
        fp.write("\n".join(imageable))