forked from clint-kristopher-morris/llm-guided-evolution
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_improved.py
916 lines (775 loc) · 39.5 KB
/
run_improved.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
import os
import copy
import glob
import time
import string
import random
import pickle
import argparse
import subprocess
import numpy as np
from deap import base, creator, tools
from deap.tools import HallOfFame
from src.utils.print_utils import print_population, print_scores, box_print, print_job_info
from src.llm_utils import split_file, retrieve_base_code, mutate_prompts
from src.cfg.constants import *
def print_ancestery(data):
    """Print the recorded lineage of every gene in an ancestry mapping.

    :param data: Mapping of gene ID to a record holding ``'GENES'`` and
        ``'MUTATE_TYPE'`` entries.
    """
    for gene, record in data.items():
        print(f'gene: {gene}')
        print(f"\t{record['GENES']}")
        print(f"\t{record['MUTATE_TYPE']}")
def update_ancestry(gene_id_child, gene_id_parent, ancestery, mutation_type=None, gene_id_parent2=None):
    """
    Record a child gene's lineage by extending a copy of its parent's lineage.

    :param gene_id_child: The ID of the child gene (key written into ``ancestery``).
    :param gene_id_parent: The ID of the parent gene whose record is copied.
    :param ancestery: Mapping of gene ID -> ``{'GENES': [...], 'MUTATE_TYPE': [...]}``; mutated in place.
    :param mutation_type: Label appended to ``'MUTATE_TYPE'`` for a mutation. Default is None.
    :param gene_id_parent2: The ID of the second parent; when given, the step is logged as a crossover.
    :return: The same ``ancestery`` mapping that was passed in.
    """
    # The child starts from an independent deep copy of the parent's record.
    child_record = copy.deepcopy(ancestery[gene_id_parent])
    ancestery[gene_id_child] = child_record

    is_crossover = gene_id_parent2 is not None
    if is_crossover:
        gene_label = f'P:{gene_id_parent2}-C:{gene_id_child}'
        step_label = "CrossOver"
    else:
        gene_label = gene_id_child
        step_label = mutation_type

    # Build fresh lists rather than appending in place.
    child_record['GENES'] = child_record['GENES'] + [gene_label]
    child_record['MUTATE_TYPE'] = child_record['MUTATE_TYPE'] + [step_label]
    return ancestery
def generate_template(PROB_EOT, GEN_COUNT, TOP_N_GENES, SOTA_ROOT, SEED_NETWORK, ROOT_DIR):
    """
    Build the prompt template text for one LLM mutation job.

    With probability ``PROB_EOT`` (and only when ``GEN_COUNT > 0``) an "EoT"
    prompt is built from a pair of code parts taken from a top gene's network
    file and the seed network file. Otherwise a random file from
    ``templates/FixedPrompts`` is used. If the EoT branch is chosen but the two
    files yield no part pairs, the function falls back to a fixed prompt
    instead of failing.

    :param PROB_EOT: Probability of choosing the EoT prompt.
    :param GEN_COUNT: Current generation count.
    :param TOP_N_GENES: Sequence whose items carry a gene ID at index 0.
    :param SOTA_ROOT: Directory that contains ``models/network_<gene>.py``.
    :param SEED_NETWORK: Seed network file path.
    :param ROOT_DIR: Root directory that contains ``templates/``.
    :return: A tuple containing the template text and the mutation type.
    """
    template_txt = None
    mute_type = None

    if (PROB_EOT > np.random.uniform()) and (GEN_COUNT > 0):
        print("\t‣ EoT")
        top_gene = np.random.choice([x[0] for x in TOP_N_GENES])
        parts_x = split_file(f"{SOTA_ROOT}/models/network_{top_gene}.py")
        parts_y = split_file(SEED_NETWORK)
        # The first part of each file is skipped; the rest are paired positionally.
        parts = [(x.strip(), y.strip()) for x, y in zip(parts_x[1:], parts_y[1:])]
        random.shuffle(parts)
        if parts:
            # Prefer the first pair that differs; if all pairs are identical,
            # keep the last shuffled pair.
            x, y = next((pair for pair in parts if pair[0] != pair[1]), parts[-1])
            eot_template_path = os.path.join(ROOT_DIR, 'templates/EoT/EoT.txt')
            with open(eot_template_path, 'r') as file:
                eot_template_txt = file.read()
            # The third placeholder is re-emitted as "{}" so it stays in the text.
            template_txt = eot_template_txt.format(x, y, "{}")
            mute_type = "EoT"
        else:
            print("\t‣ EoT unavailable (no code parts to compare)")

    if template_txt is None:
        print("\t‣ FixedPrompts")
        prompt_templates = glob.glob(f'{ROOT_DIR}/templates/FixedPrompts/*/*.txt')
        template_path = np.random.choice(prompt_templates)
        # The mutation type is the template's file name up to the first dot.
        mute_type = os.path.basename(template_path).split('.')[0]
        with open(template_path, 'r') as file:
            template_txt = file.read()

    # NOTE(review): the constant rules are appended for both prompt kinds here;
    # confirm this matches the intended nesting of the original code.
    with open(f'{ROOT_DIR}/templates/ConstantRules.txt', 'r') as file:
        rules_txt = file.read()
    template_txt = f'{template_txt}\n{rules_txt}'
    return template_txt, mute_type
"""
Main Job Functions
"""
def write_bash_script(input_filename_x=f'{SOTA_ROOT}/network.py',
                      input_filename_y=None,
                      output_filename=f'{SOTA_ROOT}/models/network_x.py',
                      gpu='TeslaV100-PCIE-32GB',
                      python_file='src/llm_mutation.py',
                      top_p=0.1, temperature=0.2,
                      ):
    """
    Render the bash script that launches one LLM mutation or crossover job.

    Side effects: creates the output directory, updates the global ancestry
    record, and (for mutation jobs) writes the prompt template to
    ``<GENERATION>/<child gene>_model.txt``.

    :param input_filename_x: Path of the first (or only) parent network file.
    :param input_filename_y: Path of the second parent network file (crossover only).
    :param output_filename: Path where the job should write the child network.
    :param gpu: Value substituted into the first slot of ``LLM_BASH_SCRIPT_TEMPLATE``.
    :param python_file: ``'src/llm_mutation.py'`` or ``'src/llm_crossover.py'``.
    :param top_p: Passed to the job as ``--top_p``.
    :param temperature: Passed to the job as ``--temperature``.
    :return: The rendered bash script text.
    :raises ValueError: If ``python_file`` is neither of the two known scripts.
    """
    global GLOBAL_DATA_ANCESTERY

    def fetch_gene(filepath):
        # Gene ID = file name without the 'network_' prefix and '.py' suffix.
        filename = os.path.basename(filepath)
        return filename.replace('network_', '').replace('.py', '')

    # Quality control is switched on at random with probability PROB_QC.
    apply_qc = PROB_QC > np.random.uniform()

    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    gene_id_parent = fetch_gene(input_filename_x)
    gene_id_child = fetch_gene(output_filename)

    if python_file == 'src/llm_mutation.py':
        template_txt, mute_type = generate_template(PROB_EOT, GEN_COUNT, TOP_N_GENES,
                                                    SOTA_ROOT, SEED_NETWORK, ROOT_DIR)
        # Ancestry is not extended while the initial population is being created.
        if GEN_COUNT >= 0:
            GLOBAL_DATA_ANCESTERY = update_ancestry(gene_id_child, gene_id_parent, GLOBAL_DATA_ANCESTERY,
                                                    mutation_type=mute_type, gene_id_parent2=None)
        prompt_dir = str(GENERATION)
        prompt_path = os.path.join(prompt_dir, f'{gene_id_child}_model.txt')
        os.makedirs(prompt_dir, exist_ok=True)
        with open(prompt_path, 'w') as handle:
            handle.write(template_txt)
        positional_args = f'{input_filename_x} {output_filename} {prompt_path}'
    elif python_file == 'src/llm_crossover.py':
        gene_id_parent2 = fetch_gene(input_filename_y)
        GLOBAL_DATA_ANCESTERY = update_ancestry(gene_id_child, gene_id_parent, GLOBAL_DATA_ANCESTERY,
                                                mutation_type=None, gene_id_parent2=gene_id_parent2)
        positional_args = f'{input_filename_x} {input_filename_y} {output_filename}'
    else:
        raise ValueError("Invalid python_file argument")

    python_runline = (
        f"python {python_file} {positional_args} --top_p {top_p} --temperature {temperature}"
        f" --apply_quality_control '{apply_qc}' --hugging_face {HUGGING_FACE_BOOL}"
    )
    return LLM_BASH_SCRIPT_TEMPLATE.format(gpu, python_runline)
def create_bash_file(file_path, **kwargs):
    """
    Render a job script with ``write_bash_script`` and save it to ``file_path``.

    :param file_path: Destination path of the bash script.
    :param kwargs: Forwarded unchanged to ``write_bash_script``.
    """
    bash_script_content = write_bash_script(**kwargs)

    # A bare file name has an empty dirname and os.makedirs('') raises, so the
    # directory is only created when there is one. exist_ok avoids the race
    # between an existence check and the creation.
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(file_path, 'w') as file:
        file.write(bash_script_content)
    print(f"\t‣ Bash script saved to {file_path}", flush=True)
def submit_bash(file_path, **kwargs):
    """
    Write a job script and hand it to ``RUN_COMMAND``.

    :param file_path: Where the bash script is written before submission.
    :param kwargs: Forwarded to ``create_bash_file`` / ``write_bash_script``.
    :return: ``(successful_sub_flag, job_id, local_output)``. In ``LOCAL`` mode
        ``job_id`` is None and ``local_output`` is the command's stdout;
        otherwise ``job_id`` is the text after the last ``'job '`` in stdout
        and ``local_output`` is None. On a non-zero exit code the result is
        ``(False, None, None)``.
    """
    create_bash_file(file_path, **kwargs)
    result = subprocess.run([RUN_COMMAND, file_path], capture_output=True, text=True)

    if result.returncode != 0:
        print("\t‣ Failed to Submit Script.\n\t‣ Error:", result.stderr.strip(), flush=True)
        return False, None, None

    stdout_text = result.stdout.strip()
    print("\t‣ Output:", stdout_text, flush=True)
    if LOCAL:
        # The command ran the job itself, so its stdout is the job output.
        return True, None, stdout_text

    job_id = result.stdout.split('job ')[-1].strip()
    return True, job_id, None
def check_contents_for_error(contents):
    """
    Classify job output as failed, finished, or undecided.

    Parameters:
    contents (str): output of job to check for error
    Returns:
    bool: False if an error marker is present, True if "job done" is present
    (case-insensitive, error markers take precedence), None if neither.
    """
    lowered = contents.lower()
    failure_markers = ("traceback", "slurmstepd: error")

    if any(marker in lowered for marker in failure_markers):
        print("\t☠ Error Found in LLM Job Output.", flush=True)
        return False
    if "job done" in lowered:
        print("\t☑ LLM Job Completed Successfully.", flush=True)
        return True
    return None
def check4job_completion(job_id, local_output=None, check_interval=60, timeout=3600*3):
    """
    Wait for a job to finish and report whether it succeeded.

    Parameters:
    job_id (str): The job ID; its output is read from ``slurm-<job_id>.out``.
    local_output (str): Output of a job that already ran; when given it is
        classified directly and no polling happens.
    check_interval (int): Time in seconds between checks.
    timeout (int): Maximum time in seconds to wait for job completion.
    Returns:
    bool: True if job completed successfully, False on error or timeout.
    Raises:
    Exception: If ``local_output`` is given but shows neither success nor error.
    """
    if local_output is not None:
        verdict = check_contents_for_error(local_output)
        if verdict is None:
            raise Exception('Unexpected output from job')
        return verdict

    start_time = time.time()
    output_file = f'slurm-{job_id}.out'

    while time.time() - start_time <= timeout:
        if os.path.exists(output_file):
            with open(output_file, 'r') as handle:
                contents = handle.read()
            verdict = check_contents_for_error(contents)
            # None means the log is still inconclusive: keep polling.
            if verdict is not None:
                return verdict
        time.sleep(check_interval)
        print(f'\t‣ Waiting on check4job_completion LLM job: {job_id} Time: {round(time.time() - start_time)}s', flush=True)

    print("Timeout reached while waiting for job completion.")
    return False
def generate_random_string(length=20):
    """Return ``'xXx'`` followed by ``length`` random ASCII letters/digits."""
    alphabet = string.ascii_letters + string.digits
    suffix = ''.join([random.choice(alphabet) for _ in range(length)])
    return f'xXx{suffix}'
def create_individual(container, temp_min=0.05, temp_max=0.4):
    """
    Create one individual by submitting an LLM mutation job on the seed network.

    :param container: Callable used to wrap ``[gene_id]`` into an individual.
    :param temp_min: Lower bound of the sampled LLM temperature.
    :param temp_max: Upper bound of the sampled LLM temperature.
    :return: The new individual holding its gene ID. With ``DELAYED_CHECK`` set,
        a ``creator.Individual`` is returned without waiting for the job.
    """
    box_print("Create Individual", print_bbox_len=60, new_line_end=False)

    gene_id = generate_random_string(length=24)
    temperature = round(random.uniform(temp_min, temp_max), 2)
    script_path = os.path.join(str(GENERATION), f'{gene_id}.sh')

    successful_sub_flag, job_id, local_output = submit_bash(
        script_path,
        input_filename_x=f'{SOTA_ROOT}/network.py',
        output_filename=f'{SOTA_ROOT}/models/network_{gene_id}.py',
        gpu=LLM_GPU,
        python_file='src/llm_mutation.py',
        top_p=0.1, temperature=temperature)

    # Bookkeeping for the job and the start of this gene's lineage.
    GLOBAL_DATA[gene_id] = {'sub_flag':successful_sub_flag, 'job_id':job_id,
                            'status':'subbed file', 'fitness':None, 'start_time':time.time()}
    GLOBAL_DATA_ANCESTERY[gene_id] = {'GENES':[gene_id], 'MUTATE_TYPE':["CREATED"]}
    individual = container([gene_id])

    if DELAYED_CHECK:
        # Completion is verified later in one batch.
        GLOBAL_DATA[gene_id]['status'] = 'DELAYED_CHECK'
        return creator.Individual([gene_id])

    if successful_sub_flag:
        if job_id is None:
            print(f'Checking completion for {gene_id}', flush=True)
        else:
            print(f'Checking for Job Completion: {job_id} for {gene_id}', flush=True)
        # The result is not used here; the call only blocks until the job ends.
        check4job_completion(job_id=job_id, local_output=local_output)
    return individual
def submit_run(gene_id):
    """
    Write and launch the training script that evaluates a gene's network.

    The script is saved as ``<GENERATION>/<gene_id>_model.sh`` and run with
    ``RUN_COMMAND``. ``GLOBAL_DATA[gene_id]`` is then updated with status
    ``'running eval'``, the job ID, and (in ``LOCAL`` mode) the captured output.

    :param gene_id: ID of the gene whose ``models.network_<gene_id>`` is trained.
    """
    def build_train_script(train_file='./sota/ExquisiteNetV2/train.py'):
        # Only the trailing flag differs between the two platforms.
        platform_flag = "-epoch 2" if MACOS else "-amp"
        shared_flags = f"-data {DATA_PATH} -end_lr 0.001 -seed 21 -val_r 0.2 {platform_flag}"
        runline = f'python {train_file} -bs 216 -network "models.network_{gene_id}" {shared_flags}'
        return PYTHON_BASH_SCRIPT_TEMPLATE.format(runline)

    def launch(script_path):
        with open(script_path, 'w') as handle:
            handle.write(build_train_script())
        print(f"\t‣ Bash Script Saved to {script_path}")

        result = subprocess.run([RUN_COMMAND, script_path], capture_output=True, text=True)
        if LOCAL:
            # Local runs count as submitted regardless of the exit code; the
            # combined stdout/stderr is inspected later.
            combined = result.stdout.strip() + '\n' + result.stderr.strip()
            print("\t‣ Output:", combined, flush=True)
            return True, None, combined
        if result.returncode == 0:
            print("\t‣ Script Submitted Successfully.\n\t‣ Output:", result.stdout.strip())
            return True, result.stdout.split('job ')[-1].strip(), None
        print("\t‣ Failed to Submit script.\n\t‣ Error:", result.stderr.strip())
        return False, None, None

    file_path = os.path.join(str(GENERATION), f'{gene_id}_model.sh')
    successful_sub_flag, job_id, local_output = launch(file_path)

    record = GLOBAL_DATA[gene_id]
    record['status'] = 'running eval'
    record['results_job'] = job_id
    record['local_output'] = local_output
    print(f'\t‣ Running py File for {gene_id}, {job_id}')
def evalModel(individual):
    """Placeholder evaluator: reads the gene ID and returns no fitness (None)."""
    _gene_id = individual[0]  # read but unused; fitness is assigned elsewhere
    return None
def check4model2run(gene_id):
    """
    Start the evaluation run for a gene once its network file exists.

    Nothing happens if ``<SOTA_ROOT>/models/network_<gene_id>.py`` is missing
    or the gene's status is already ``'running eval'``.
    """
    print(f'Checking for: SOTA_ROOT ./models/network_{gene_id}.py')
    model_path = f'{SOTA_ROOT}/models/network_{gene_id}.py'

    if not os.path.exists(model_path):
        return
    already_running = GLOBAL_DATA[gene_id]['status'] == 'running eval'
    if not already_running:
        submit_run(gene_id)
def check4results(gene_id):
    """
    Poll a gene's evaluation job and record its fitness once it has finished.

    On success the first two comma-separated values of
    ``<SOTA_ROOT>/results/<gene_id>_results.txt`` become the fitness tuple.
    On failure (job error, or a results file that is missing or malformed) the
    fitness is set to ``INVALID_FITNESS_MAX``. In both cases the gene's status
    becomes ``'completed'``. Nothing is changed while the job is still running.

    :param gene_id: ID of the gene to check.
    :return: True if results were recorded, False if the run failed, None if
        the job has not finished yet.
    :raises Exception: If locally captured output shows neither success nor error.
    """
    def check4error(gene_id):
        job_id = GLOBAL_DATA[gene_id]['results_job']
        local_output = GLOBAL_DATA[gene_id]['local_output']
        if local_output is not None:
            state = check_contents_for_error(local_output)
            if state is None:
                print(local_output, flush=True)
                raise Exception('Unexpected output from job')
            return state
        # There is no local output, so look for the slurm log instead.
        output_file = f'slurm-{job_id}.out'
        if os.path.exists(output_file):
            with open(output_file, 'r') as file:
                contents = file.read()
            return check_contents_for_error(contents)
        return None

    job_done = check4error(gene_id)

    if job_done is True:
        results_path = f'{SOTA_ROOT}/results/{gene_id}_results.txt'
        try:
            with open(results_path, 'r') as file:
                results = file.read()
            values = [float(r.strip()) for r in results.split(',')]
            # TODO: get all features later
            fitness = (values[0], values[1])
        except (OSError, ValueError, IndexError) as err:
            # A finished job without usable results is a failed run; one bad
            # gene should not abort the whole evolution loop.
            print(f'\t☠ Could not read results for {gene_id} from {results_path}: {err}', flush=True)
            fitness = INVALID_FITNESS_MAX
            job_done = False
        GLOBAL_DATA[gene_id]['status'] = 'completed'
        GLOBAL_DATA[gene_id]['fitness'] = fitness
    elif job_done is False:
        GLOBAL_DATA[gene_id]['status'] = 'completed'
        GLOBAL_DATA[gene_id]['fitness'] = INVALID_FITNESS_MAX
    # job_done is None: the job has not finished running yet.

    return job_done
def check_and_update_fitness(population, timeout=3600*30, loop_delay=60*30):
    """
    Submit evaluation runs for the population and block until every individual
    has a fitness.

    :param population: Individuals whose first element is the gene ID and whose
        fitness is ``PLACEHOLDER_FITNESS`` while still unevaluated.
    :param timeout: (int) seconds until a model run is given up on and assigned
        the max error.
    :param loop_delay: (int) seconds to sleep between passes over the jobs.

    For each job there are four possibilities:
        ‣ The Model for Gene: {gene_id} Failed to Run
        ‣ The Model for Gene: {gene_id} Completed Successfully!
        ‣ Still Waiting On: Gene: {gene_id}
        ‣ Timeout for gene ID {gene_id}
    """
    box_print("SUBMITTING MODELS CREATED BY LLM")
    count = 0
    while True:
        box_print(f"Checking Model Runs: {count}", print_bbox_len=60, new_line_end=False)
        all_done = True
        for ind in population:
            gene_id = ind[0]

            # Genes without a record are treated as failed jobs.
            if gene_id not in GLOBAL_DATA:
                GLOBAL_DATA[gene_id] = {'sub_flag':False, 'job_id':'None', 'status':'completed',
                                        'fitness':INVALID_FITNESS_MAX, 'start_time':time.time()}
            if GLOBAL_DATA[gene_id]['sub_flag'] == False:  # noqa: E712 - keep original semantics
                ind.fitness.values = INVALID_FITNESS_MAX  # Max error
                # The original compared with '==' here and so recorded nothing.
                GLOBAL_DATA[gene_id]['status'] = "completed"
                GLOBAL_DATA[gene_id]['fitness'] = INVALID_FITNESS_MAX

            if ind.fitness.values == PLACEHOLDER_FITNESS:  # fitness not assigned yet
                if GLOBAL_DATA[gene_id]['status'] == 'subbed file':
                    # Starts the evaluation run if the model file exists;
                    # the status then becomes 'running eval'.
                    check4model2run(gene_id)

                if GLOBAL_DATA[gene_id]['status'] == 'running eval':
                    print(f'Checking the Results for Gene: {gene_id}')
                    no_error_flag = check4results(gene_id)
                    if no_error_flag == False:  # noqa: E712 - None means "still running"
                        print(f"LLM Failed for Gene: {gene_id}")
                        ind.fitness.values = INVALID_FITNESS_MAX
                        GLOBAL_DATA[gene_id]['status'] = 'completed'

                if GLOBAL_DATA[gene_id]['status'] == "completed":
                    # Copy the recorded fitness onto the individual.
                    fitness_tuple = GLOBAL_DATA[gene_id]['fitness']
                    ind.fitness.values = fitness_tuple
                elif time.time() - GLOBAL_DATA[gene_id]['start_time'] > timeout:
                    print(f"Timeout for gene ID {gene_id}")
                    ind.fitness.values = INVALID_FITNESS_MAX
                    GLOBAL_DATA[gene_id]['status'] = 'FAILED: TIMEOUT'
                else:
                    if 'results_job' not in GLOBAL_DATA[gene_id].keys():
                        # No evaluation run was ever started for this gene.
                        ind.fitness.values = INVALID_FITNESS_MAX  # Max error
                        print(f'\t☠ No Placeholder Fitness for: {gene_id}')
                        # Same '==' slip as above, fixed to an assignment.
                        GLOBAL_DATA[gene_id]['status'] = "completed"
                        GLOBAL_DATA[gene_id]['fitness'] = INVALID_FITNESS_MAX
                    else:
                        print(f"\t‣ Still Waiting On: Gene: {gene_id}", flush=True)
                        print_job_info(GLOBAL_DATA[gene_id])
                        all_done = False  # Some jobs are still running

        if all_done:
            box_print("Evalutated All Genes", print_bbox_len=60)
            break  # All jobs are done or timed out
        print('Delayed...', flush=True)
        time.sleep(loop_delay)  # Wait some time before checking again
        count += 1
def update_individual(ind, new_gene_id, old_gene_id=None, process_success=True, process_type='Mutation'):
    """
    Point an individual at the right gene after a mutation or mating attempt.

    :param ind: The individual to be updated; its first element is also rewritten in place.
    :param new_gene_id: Gene ID produced by the process.
    :param old_gene_id: Gene ID the individual had before. Optional.
    :param process_success: Whether the process succeeded. Default is True.
    :param process_type: ``'Mutation'`` is reported as "Mutated"; anything else as "Mated".
    :return: On success a fresh ``creator.Individual`` for ``new_gene_id`` (the
        old gene's ``GLOBAL_DATA`` entry is dropped). On failure the new gene's
        entry is dropped and a fresh individual for ``old_gene_id`` is returned,
        or ``ind`` unchanged when no old ID was given.
    """
    operation = 'Mated'
    if process_type == 'Mutation':
        operation = 'Mutated'

    if not process_success:
        print(f'\t☠ Failed {operation}: {new_gene_id}')
        GLOBAL_DATA.pop(new_gene_id, None)
        if old_gene_id is None:
            return ind
        ind[0] = old_gene_id
        return creator.Individual([old_gene_id])

    ind[0] = new_gene_id
    replacement = creator.Individual([new_gene_id])
    if old_gene_id is not None:
        GLOBAL_DATA.pop(old_gene_id, None)
    print(f'\t☑ {operation}: {new_gene_id}')
    return replacement
# TODO: I need to cycle through by the job id to match the sub order
def delayed_mate_check(offspring):
    """
    Resolve crossover jobs whose completion check was deferred.

    Only runs when ``DELAYED_CHECK`` is True. For each individual whose record
    has status ``'DELAYED_CHECK'``, the status is reset to ``'subbed file'``,
    the job is waited for, and the individual is updated in place through
    ``update_individual``.

    :param offspring: Iterable of individuals (first element is the gene ID).
    :return: The same ``offspring`` object.
    """
    if DELAYED_CHECK is True:
        for individual in offspring:
            k = individual[0]
            if k in GLOBAL_DATA and GLOBAL_DATA[k]["status"] == "DELAYED_CHECK":
                GLOBAL_DATA[k]["status"] = "subbed file"
                successful_sub_flag = GLOBAL_DATA[k]["sub_flag"]
                new_gene_id, job_id = k, GLOBAL_DATA[k]["job_id"]
                print(f'Delayed Mating Check: {new_gene_id}, LLM Job ID: {job_id}')

                if successful_sub_flag:
                    print(f'\t‣ Checking for Crossover Job Completion: {job_id} for {new_gene_id}')
                    job_done = check4job_completion(job_id)
                else:
                    # Nothing was submitted, so there is no log to wait for.
                    # Polling would only block until the timeout.
                    job_done = False

                if job_done:
                    print(f'\t‣ Model Files for {new_gene_id} are Loaded', flush=True)
                else:
                    print(f'\t‣ Error Loading Model Files for {new_gene_id}!!', flush=True)

                failed_process = not (successful_sub_flag and job_done)
                # NOTE(review): on failure the IDs are swapped before calling
                # update_individual, so the parent's GLOBAL_DATA entry is the
                # one removed and the individual keeps the failed child ID.
                # delayed_mutate_check does not swap - confirm this is intended.
                if failed_process:
                    new_gene_id = LINKED_GENES[k]
                    old_gene_id = k
                else:
                    new_gene_id = k
                    old_gene_id = LINKED_GENES[k]
                # The individual is modified in place; the returned object is unused.
                individual = update_individual(individual, new_gene_id, old_gene_id=old_gene_id,
                                               process_success=not failed_process, process_type='Mating')
    return offspring
def delayed_creation_check(offspring):
    """
    Wait for creation jobs whose completion check was deferred.

    Only runs when ``DELAYED_CHECK`` is True. Each individual whose record has
    status ``'DELAYED_CHECK'`` is reset to ``'subbed file'``; if its job was
    submitted successfully, the call blocks until that job finishes.

    :param offspring: Iterable of individuals (first element is the gene ID).
    :return: The same ``offspring`` object.
    """
    if DELAYED_CHECK is not True:
        return offspring

    for individual in offspring:
        gene_id = individual[0]
        if gene_id not in GLOBAL_DATA:
            continue
        record = GLOBAL_DATA[gene_id]
        if record["status"] != "DELAYED_CHECK":
            continue

        record["status"] = "subbed file"
        if record["sub_flag"]:
            job_id = record["job_id"]
            print(f'Checking for Job Completion: {job_id} for {gene_id}', flush=True)
            # The outcome is not used here; the call only blocks until done.
            check4job_completion(job_id)
    return offspring
def delayed_mutate_check(offspring):
    """
    Resolve mutation jobs whose completion check was deferred.

    Only runs when ``DELAYED_CHECK`` is True. For each individual whose record
    has status ``'DELAYED_CHECK'``, the status is reset to ``'subbed file'``,
    a submitted job is waited for, and ``update_individual`` either keeps the
    mutated gene or reverts to the linked parent gene.

    :param offspring: Iterable of individuals (first element is the gene ID).
    :return: The same ``offspring`` object.
    """
    if DELAYED_CHECK is not True:
        return offspring

    for individual in offspring:
        new_gene_id = individual[0]
        if new_gene_id not in GLOBAL_DATA:
            continue
        record = GLOBAL_DATA[new_gene_id]
        if record["status"] != "DELAYED_CHECK":
            continue

        record["status"] = "subbed file"
        submitted = record["sub_flag"]
        job_done = False
        if submitted:
            job_id = record["job_id"]
            print(f'Delayed Mutation Check: {new_gene_id}, LLM Job ID: {job_id}', flush=True)
            print(f'\t‣ Checking for Creation Job Completion: {job_id} for {new_gene_id}')
            job_done = check4job_completion(job_id)
            if job_done:
                print(f'\t‣ Model Files for {new_gene_id} are Loaded')
            else:
                print(f'\t☠ Error Loading Model Files for {new_gene_id}')

        succeeded = bool(submitted and job_done)
        parent_gene_id = LINKED_GENES[new_gene_id]
        # The individual is modified in place; the returned object is unused.
        update_individual(individual, new_gene_id, old_gene_id=parent_gene_id,
                          process_success=succeeded, process_type='Mutation')
    return offspring
# Custom crossover function
def customCrossover(ind1, ind2):
    """
    Mate two individuals by submitting two LLM crossover jobs (one per parent order).

    :param ind1: First parent individual (first element is its gene ID).
    :param ind2: Second parent individual.
    :return: Two offspring individuals. With ``DELAYED_CHECK`` set, the children
        are returned immediately and linked to their parents in ``LINKED_GENES``;
        otherwise each job is waited for and a failed child reverts to its parent.
    """
    def combine_elements(parent_a, parent_b, temp_min=0.05, temp_max=0.1):
        """
        Submit one crossover job for ``parent_a`` x ``parent_b``.

        Returns:
        tuple: (new gene ID, failed flag); the flag is None when the
        completion check is deferred.
        """
        gene_id_1 = parent_a[0]
        gene_id_2 = parent_b[0]
        print(f'Mating: {gene_id_1} and {gene_id_2}')

        temperature = round(random.uniform(temp_min, temp_max), 2)
        new_gene_id = generate_random_string(length=24)
        script_path = os.path.join(str(GENERATION), f'{new_gene_id}.sh')

        successful_sub_flag, job_id, local_output = submit_bash(
            script_path,
            input_filename_x=f'{SOTA_ROOT}/models/network_{gene_id_1}.py',
            input_filename_y=f'{SOTA_ROOT}/models/network_{gene_id_2}.py',
            output_filename=f'{SOTA_ROOT}/models/network_{new_gene_id}.py',
            gpu=LLM_GPU,
            python_file='src/llm_crossover.py',
            top_p=0.1, temperature=temperature)

        GLOBAL_DATA[new_gene_id] = {'sub_flag':successful_sub_flag, 'job_id':job_id,
                                    'status':'subbed file', 'fitness':None, 'start_time':time.time()}
        if DELAYED_CHECK:
            GLOBAL_DATA[new_gene_id]['status'] = 'DELAYED_CHECK'
            return new_gene_id, None

        if successful_sub_flag:
            print(f'\t‣ Checking for Crossover Job Completion: {job_id} for {new_gene_id}')
            job_done = check4job_completion(job_id, local_output)
            if job_done:
                print(f'\t‣ Model Files for {new_gene_id} are Loaded')
            else:
                print(f'\t‣ Error Loading Model Files for {new_gene_id}!!')

        # job_done is only looked at when the submission succeeded.
        failed = (successful_sub_flag is False) or (job_done is False)
        return new_gene_id, failed

    child_id_1, failed_1 = combine_elements(ind1, ind2)
    child_id_2, failed_2 = combine_elements(ind2, ind1)

    if DELAYED_CHECK:
        # Remember each child's parent so a later check can revert to it.
        LINKED_GENES[child_id_1] = ind1[0]
        LINKED_GENES[child_id_2] = ind2[0]
        ind1[0] = child_id_1
        ind2[0] = child_id_2
        return creator.Individual([child_id_1]), creator.Individual([child_id_2])

    offspring1 = update_individual(ind1, child_id_1, old_gene_id=ind1[0],
                                   process_success=(not failed_1), process_type='Mating')
    offspring2 = update_individual(ind2, child_id_2, old_gene_id=ind2[0],
                                   process_success=(not failed_2), process_type='Mating')
    return offspring1, offspring2
def customMutation(individual, indpb, temp_min=0.02, temp_max=0.35):
    """ Mutate an individual by submitting an LLM mutation job on its network file.

    Parameters:
    individual (list): The individual to be mutated (first element is its gene ID).
    indpb (float): Accepted for the toolbox signature; not used by this function.
    temp_min, temp_max (float): Bounds of the sampled LLM temperature.
    Returns:
    The mutated individual. With ``DELAYED_CHECK`` set it is returned right away
    and linked to its parent; otherwise the job is waited for and a failed
    mutation reverts to the old gene.
    """
    # TODO: connect the mutation probability (indpb) to temp
    old_gene_id = individual[0]
    new_gene_id = generate_random_string(length=24)
    print(f'Mutating: {old_gene_id} and Replaceing with: {new_gene_id}')

    script_path = os.path.join(str(GENERATION), f'{new_gene_id}.sh')
    temperature = round(random.uniform(temp_min, temp_max), 2)
    successful_sub_flag, job_id, local_output = submit_bash(
        script_path,
        input_filename_x=f'{SOTA_ROOT}/models/network_{old_gene_id}.py',
        output_filename=f'{SOTA_ROOT}/models/network_{new_gene_id}.py',
        gpu=LLM_GPU,
        python_file='src/llm_mutation.py',
        top_p=0.1, temperature=temperature)

    GLOBAL_DATA[new_gene_id] = {'sub_flag':successful_sub_flag, 'job_id':job_id,
                                'status':'subbed file', 'fitness':None, 'start_time':time.time()}

    if DELAYED_CHECK:
        # Remember the parent so a later check can revert to it.
        LINKED_GENES[new_gene_id] = individual[0]
        GLOBAL_DATA[new_gene_id]['status'] = 'DELAYED_CHECK'
        individual[0] = new_gene_id
        return creator.Individual([new_gene_id])

    if successful_sub_flag:
        print(f'\t‣ Checking for Mutation Job Completion: {job_id} for {new_gene_id}')
        job_done = check4job_completion(job_id, local_output)
        if job_done:
            print(f'\t‣ Model Files for {new_gene_id} are Loaded')
        else:
            print(f'\t☠ Error Loading Model Files for {new_gene_id}')

    # job_done is only looked at when the submission succeeded.
    succeeded = bool(successful_sub_flag and job_done)
    return update_individual(individual, new_gene_id, old_gene_id,
                             process_success=succeeded, process_type='Mutation')
def remove_duplicates(population):
    """Return the individuals of ``population`` with repeats removed.

    Two individuals are duplicates when their contents are equal as tuples.
    The first occurrence is kept and the original order is preserved.
    """
    seen_chromosomes = set()
    unique_individuals = []
    for candidate in population:
        key = tuple(candidate)  # lists are not hashable
        if key in seen_chromosomes:
            continue
        seen_chromosomes.add(key)
        unique_individuals.append(candidate)
    return unique_individuals
# --- Checkpoint Functions --- #
def save_checkpoint(gen, folder_name="checkpoints"):
    """
    Pickle the run state to ``<folder_name>/checkpoint_gen_<gen>.pkl``.

    The saved state is the module-level ``GLOBAL_DATA``, ``GLOBAL_DATA_HIST``,
    ``population``, ``hof`` and ``GLOBAL_DATA_ANCESTERY``.

    :param gen: Generation number used in the file name.
    :param folder_name: Directory for checkpoint files; created if missing.
    """
    os.makedirs(folder_name, exist_ok=True)
    filename = os.path.join(folder_name, f'checkpoint_gen_{gen}.pkl')

    snapshot = {}
    snapshot["GLOBAL_DATA"] = GLOBAL_DATA
    snapshot["GLOBAL_DATA_HIST"] = GLOBAL_DATA_HIST
    snapshot["population"] = population
    snapshot["hof"] = hof
    snapshot["GLOBAL_DATA_ANCESTERY"] = GLOBAL_DATA_ANCESTERY

    with open(filename, 'wb') as handle:
        pickle.dump(snapshot, handle)
    print(f"Checkpoint saved as {filename}")
def load_checkpoint(folder_name="checkpoints", checkpoint_file=None):
    """
    Load a pickled checkpoint and work out which generation to resume from.

    When ``checkpoint_file`` is not given, the file named
    ``checkpoint_gen_<N>.pkl`` with the numerically largest ``N`` is used.
    A plain string sort would rank ``gen_9`` above ``gen_10``. Other files in
    the folder are ignored.

    :param folder_name: Directory holding the checkpoint files.
    :param checkpoint_file: Optional file name (inside ``folder_name``) to load.
    :return: ``(checkpoint_data, start_gen)`` where ``start_gen`` is the saved
        generation plus one, or ``(None, None)`` when nothing can be loaded.
    """
    import re  # local import: only needed here

    def generation_of(name):
        # Generation number encoded in a checkpoint file name, or None.
        match = re.fullmatch(r'checkpoint_gen_(\d+)\.pkl', os.path.basename(name))
        return int(match.group(1)) if match else None

    if not os.path.exists(folder_name):
        return None, None

    if checkpoint_file is None:
        candidates = [(generation_of(name), name) for name in os.listdir(folder_name)]
        candidates = [pair for pair in candidates if pair[0] is not None]
        if not candidates:
            return None, None
        saved_gen, checkpoint_file = max(candidates)
    elif not checkpoint_file:
        return None, None
    else:
        saved_gen = generation_of(checkpoint_file)
        if saved_gen is None:
            # Legacy parsing for explicitly named files: third '_' field.
            saved_gen = int(checkpoint_file.split('_')[2].split('.')[0])

    filepath = os.path.join(folder_name, checkpoint_file)
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # load checkpoints written by a trusted run.
    with open(filepath, 'rb') as file:
        checkpoint_data = pickle.load(file)
    print(f"Loaded checkpoint from {filepath}")
    return checkpoint_data, saved_gen + 1
def true_nsga2(pop, k):
    """Select ``k`` individuals with NSGA-II ordering followed by a DCD tournament.

    The whole population is first passed through ``tools.selNSGA2`` (selecting
    ``len(pop)`` individuals), and the result is then fed to
    ``tools.selTournamentDCD`` to draw ``k`` of them.

    Args:
        pop: Individuals to select from.
        k: Number of individuals to draw; the original author notes this
            should be a multiple of 4.

    Returns:
        The list produced by ``tools.selTournamentDCD``.
    """
    ranked = tools.selNSGA2(pop, len(pop))
    return tools.selTournamentDCD(ranked, k)
# Define the problem
# Multi-objective fitness type; its weights come from FITNESS_WEIGHTS, which is
# defined outside this section.
creator.create("FitnessMulti", base.Fitness, weights=FITNESS_WEIGHTS) # Adjust weights as needed
# Individuals are lists that carry a FitnessMulti fitness and a file_id
# attribute defaulting to None.
creator.create("Individual", list, fitness=creator.FitnessMulti, file_id=None)
# Initialize the toolbox
toolbox = base.Toolbox()
# Operator registry used by the main script. create_individual, evalModel,
# customCrossover and customMutation are defined outside this section.
toolbox.register("individual", create_individual, creator.Individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evalModel)
toolbox.register("mate", customCrossover)
# indpb=0.2 is bound as a keyword argument for every customMutation call.
toolbox.register("mutate", customMutation, indpb=0.2)
# Selection is the NSGA-II + DCD tournament wrapper true_nsga2.
toolbox.register("select", true_nsga2)
# TODO: start using the percent difference between train accuracy and val/test
# accuracy as an overfitting metric
# 40398682
# NOTE(review): the number above is unexplained in this section - confirm
# whether it is still needed.

# Module-level state shared with the main script.
# Index of the generation currently running; set from the main loop (-1 before it starts).
GEN_COUNT = -1
# Refreshed each generation with tools.selSPEA2(population, NUM_EOT_ELITES).
TOP_N_GENES = None
# Reset to an empty dict by the main script; contents are managed elsewhere.
LINKED_GENES = {}
# Per-gene job information and fitness values (keyed by gene id).
GLOBAL_DATA = {}
# Accumulates GLOBAL_DATA entries over time via dict.update in the main script.
GLOBAL_DATA_HIST = {}
# Saved in checkpoints and printed by print_ancestery; structure is not
# visible in this section.
GLOBAL_DATA_ANCESTERY = {}
# Main Evolution Loop
if __name__ == "__main__":
    # Entry point: run (or resume) the evolutionary search.
    #
    # Command line: one positional argument, the directory used for both
    # loading and saving checkpoints.
    parser = argparse.ArgumentParser(description='Run Generation')
    parser.add_argument('checkpoints', type=str, help='Save Dir')
    args = parser.parse_args()

    print(DNA_TXT)

    # Resume from a saved checkpoint when load_checkpoint finds one; otherwise
    # build a fresh population from the seed code.
    checkpoint, start_gen = load_checkpoint(folder_name=args.checkpoints)
    if checkpoint:
        box_print("LOADING CHECKPOINT")
        GLOBAL_DATA = checkpoint["GLOBAL_DATA"]
        GLOBAL_DATA_HIST = checkpoint["GLOBAL_DATA_HIST"]
        GLOBAL_DATA_ANCESTERY = checkpoint["GLOBAL_DATA_ANCESTERY"]
        population = checkpoint["population"]
        hof = checkpoint["hof"]
    else:
        # Create an initial population
        start_gen = 0
        box_print("CREATING POPULATION FROM SEED CODE")
        population = toolbox.population(n=start_population_size)
        box_print("Batch Checking Created Genes", print_bbox_len=60, new_line_end=False)
        delayed_creation_check(population)
        hof = tools.HallOfFame(hof_size)
        # Give every individual the placeholder fitness, then resolve the
        # whole population in one batch.
        for ind in population:
            ind.fitness.values = PLACEHOLDER_FITNESS
        check_and_update_fitness(population)

    # Evolution
    for gen in range(start_gen, num_generations):
        GEN_COUNT = gen
        TOP_N_GENES = tools.selSPEA2(population, NUM_EOT_ELITES)
        box_print(f"STARTING GENERATION: {gen}", new_line_end=False)
        print_population(population, GLOBAL_DATA)

        box_print("Invalid Removal", print_bbox_len=60, new_line_end=False)
        # Drop individuals whose fitness equals INVALID_FITNESS_MAX.
        population = [ind for ind in population if ind.fitness.values != INVALID_FITNESS_MAX]
        print_population(population, GLOBAL_DATA)

        box_print("Selection", print_bbox_len=60, new_line_end=False)
        # Elites bypass mutation and crossover so they are not lost.
        elites = tools.selSPEA2(population, num_elites)
        # Select the next generation's parents
        offspring = toolbox.select(population, population_size)
        print_population(offspring, GLOBAL_DATA)
        print([len(GLOBAL_DATA_HIST), len(GLOBAL_DATA), len(population), len(offspring)])
        # Clone the selected individuals so variation does not touch the parents.
        offspring = list(map(toolbox.clone, offspring))
        GLOBAL_DATA_HIST.update(GLOBAL_DATA)

        # Apply crossover on neighbouring pairs of the offspring
        box_print("Mating", print_bbox_len=60, new_line_end=False)
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < crossover_probability:
                # NOTE(review): the returned pair is only rebound to the loop
                # variables, so this relies on toolbox.mate updating its
                # arguments in place - confirm against customCrossover.
                child1, child2 = toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values

        box_print("Batch Checking Mated Genes", print_bbox_len=60, new_line_end=False)
        offspring = delayed_mate_check(offspring)
        print_population(offspring, GLOBAL_DATA)

        # Apply mutation on the offspring
        box_print("Mutating", print_bbox_len=60, new_line_end=False)
        for mutant in offspring:
            if random.random() < mutation_probability:
                # The return value is discarded; the mutant is expected to be
                # updated in place.
                toolbox.mutate(mutant)
                del mutant.fitness.values

        box_print("GLOBAL_DATA_ANCESTERY", new_line_end=False)
        print_ancestery(GLOBAL_DATA_ANCESTERY)
        box_print("Batch Checking Mutated Genes", print_bbox_len=60, new_line_end=False)
        offspring = delayed_mutate_check(offspring)
        print_population(offspring, GLOBAL_DATA)

        # Add the elites back after variation (rather than before it) so they
        # are carried over unchanged, then drop any duplicates this creates.
        offspring.extend(elites)
        offspring = remove_duplicates(offspring)

        # Bring back the elite history: the first gene of each elite is its
        # key in GLOBAL_DATA / GLOBAL_DATA_HIST.
        for gene_key in [elite[0] for elite in elites]:
            if gene_key in GLOBAL_DATA_HIST:
                GLOBAL_DATA[gene_key] = GLOBAL_DATA_HIST[gene_key]

        # GLOBAL_DATA should hold the job information and fitness values.
        # check_and_update_fitness loads the results from that dict when it
        # reaches:
        #     if GLOBAL_DATA[gene_id]['status'] == "completed":
        #
        # NOTE: toolbox.evaluate is deliberately not called here. The earlier
        # `fitnesses = map(toolbox.evaluate, invalid_ind)` was a lazy iterator
        # that was never consumed, so it never evaluated anything; it has been
        # removed to avoid suggesting otherwise.
        for ind in offspring:
            # Assign the placeholder to all so they can be checked at once.
            ind.fitness.values = PLACEHOLDER_FITNESS
        GLOBAL_DATA_HIST.update(GLOBAL_DATA)
        check_and_update_fitness(offspring)
        GLOBAL_DATA_HIST.update(GLOBAL_DATA)

        # Replace the old population with the offspring
        population[:] = offspring

        # Report the scores, refresh the hall of fame and persist the state.
        print_scores(population, FITNESS_WEIGHTS)
        hof.update(population)
        save_checkpoint(gen, folder_name=args.checkpoints)
        LINKED_GENES = {}
        # mutate x prompts
        mutate_prompts()

    print("-- End of Evolution --")
    best_ind = tools.selBest(population, 1)[0]
    print(f"Best Individual: {best_ind}")
    print(f"Best Fitness: {best_ind.fitness.values}")