This matrix displays the results of matches among six LLMs and a random-play generator (Random-Play). Each player played against every other player five times. Each row shows the results of the games between the player in the 'Player' column and each of the opponents named in the column headers; the block of columns where a player would face itself is filled with zeros.
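Each player played 5 games against each opponent for every game and prompt-type combination. Counting only LLM opponents, this totals 25 games per player for the list and illustration prompts, and 20 games per player for the image prompt type, for which the table contains no Llama3-70b results. The 5 games against Random-Play in each row are in addition to these totals.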
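Game | Prompt | Player | Claude-3-Sonnet W1st | Claude-3-Sonnet W2nd | Claude-3-Sonnet Dr | Claude-3-Sonnet Dq1st | Claude-3-Sonnet Dq2nd | Gemini-1.5-flash W1st | Gemini-1.5-flash W2nd | Gemini-1.5-flash Dr | Gemini-1.5-flash Dq1st | Gemini-1.5-flash Dq2nd | Gemini-1.5-pro W1st | Gemini-1.5-pro W2nd | Gemini-1.5-pro Dr | Gemini-1.5-pro Dq1st | Gemini-1.5-pro Dq2nd | GPT-4-turbo W1st | GPT-4-turbo W2nd | GPT-4-turbo Dr | GPT-4-turbo Dq1st | GPT-4-turbo Dq2nd | GPT-4o W1st | GPT-4o W2nd | GPT-4o Dr | GPT-4o Dq1st | GPT-4o Dq2nd | Llama3-70b W1st | Llama3-70b W2nd | Llama3-70b Dr | Llama3-70b Dq1st | Llama3-70b Dq2nd | Random-Play W1st | Random-Play W2nd | Random-Play Dr | Random-Play Dq1st | Random-Play Dq2nd |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|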
TicTacToe | List | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 2 | 0 | 3 | 0 | 0 | 2 | 2 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 2 |
Gemini-1.5-flash | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 1 | 2 | 1 | 0 | 1 | ||
Gemini-1.5-pro | 4 | 1 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 4 | 0 | 1 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 2 | ||
GPT-4-turbo | 1 | 1 | 2 | 1 | 0 | 1 | 4 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 2 | ||
GPT-4o | 5 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
Llama3-70b | 4 | 0 | 1 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 3 | ||
Random-Play | 2 | 0 | 0 | 3 | 0 | 1 | 2 | 0 | 2 | 0 | 0 | 2 | 0 | 3 | 0 | 1 | 1 | 0 | 3 | 0 | 1 | 2 | 0 | 2 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ||
Illustration | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 2 | 2 | 3 | 2 | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 2 | |
Gemini-1.5-flash | 1 | 1 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 1 | 0 | 0 | 5 | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 1 | ||
Gemini-1.5-pro | 1 | 3 | 1 | 0 | 0 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 2 | ||
GPT-4-turbo | 4 | 1 | 0 | 0 | 0 | 2 | 0 | 1 | 0 | 2 | 1 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 2 | ||
GPT-4o | 3 | 2 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 2 | ||
Llama3-70b | 3 | 2 | 0 | 0 | 0 | 3 | 0 | 2 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 2 | ||
Random-Play | 2 | 0 | 0 | 3 | 0 | 1 | 1 | 0 | 2 | 1 | 1 | 2 | 0 | 2 | 0 | 1 | 1 | 0 | 3 | 0 | 1 | 4 | 0 | 0 | 0 | 0 | 3 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | ||
Image | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 2 | 0 | 2 | 0 | 0 | 2 | 1 | 1 | 3 | 0 | 1 | 0 | 1 | 1 | 0 | 3 | 0 | 3 | 0 | 0 | 0 | 2 | ||||||
Gemini-1.5-flash | 2 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 4 | 0 | 1 | 0 | 0 | 1 | 3 | 2 | 1 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 3 | |||||||
Gemini-1.5-pro | 1 | 2 | 0 | 1 | 1 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 2 | 0 | 1 | 0 | 4 | 0 | 1 | 1 | 0 | 1 | 2 | |||||||
GPT-4-turbo | 5 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 3 | 0 | 3 | 0 | 0 | 1 | 1 | |||||||
GPT-4o | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 2 | 0 | 2 | 0 | 0 | 0 | 3 | 1 | 2 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 2 | |||||||
Random-Play | 1 | 0 | 0 | 3 | 1 | 1 | 2 | 0 | 2 | 0 | 1 | 2 | 0 | 2 | 0 | 2 | 0 | 0 | 0 | 3 | 0 | 1 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | |||||||
Connect4 | List | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 |
Gemini-1.5-flash | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
Gemini-1.5-pro | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 1 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | ||
GPT-4-turbo | 4 | 1 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | ||
GPT-4o | 4 | 1 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
Llama3-70b | 2 | 3 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | ||
Random-Play | 1 | 4 | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 1 | 3 | 2 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ||
Illustration | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 1 | 0 | 2 | 2 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | |
Gemini-1.5-flash | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 3 | 4 | 0 | 0 | 1 | 0 | 3 | 2 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
Gemini-1.5-pro | 3 | 2 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 3 | 0 | 3 | 1 | 0 | 1 | 0 | ||
GPT-4-turbo | 1 | 0 | 0 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 3 | 0 | 0 | 2 | 0 | ||
GPT-4o | 2 | 1 | 0 | 0 | 2 | 4 | 1 | 0 | 0 | 0 | 2 | 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | ||
Llama3-70b | 3 | 2 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | ||
Random-Play | 1 | 4 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ||
Image | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 3 | 1 | 2 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 3 | 2 | 4 | 0 | 0 | 1 | 0 | ||||||
Gemini-1.5-flash | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 3 | 0 | 4 | 0 | 0 | 1 | 0 | |||||||
Gemini-1.5-pro | 0 | 0 | 0 | 4 | 1 | 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 2 | 1 | 0 | 2 | 0 | 2 | 2 | 0 | 1 | 0 | |||||||
GPT-4-turbo | 1 | 2 | 0 | 2 | 0 | 3 | 2 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 1 | 0 | 4 | 1 | 0 | 0 | 0 | |||||||
GPT-4o | 1 | 1 | 0 | 0 | 3 | 5 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 2 | 1 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | |||||||
Random-Play | 0 | 4 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |||||||
Gomoku | List | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 1 | 4 | 1 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 |
Gemini-1.5-flash | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
Gemini-1.5-pro | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 4 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
GPT-4-turbo | 4 | 1 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 2 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
GPT-4o | 3 | 0 | 0 | 2 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 2 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
Llama3-70b | 4 | 1 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | ||
Random-Play | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ||
Illustration | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 1 | 0 | 0 | 4 | 3 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | 5 | 0 | 0 | 0 | 0 | |
Gemini-1.5-flash | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 1 | 0 | 4 | 0 | 0 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 0 | 0 | 2 | 0 | ||
Gemini-1.5-pro | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 2 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 4 | 3 | 0 | 0 | 2 | 0 | ||
GPT-4-turbo | 3 | 0 | 0 | 2 | 0 | 2 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 4 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | ||
GPT-4o | 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 5 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | 5 | 0 | 0 | 0 | 0 | ||
Llama3-70b | 3 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 1 | 4 | 0 | 3 | 0 | 1 | 1 | 3 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 3 | 0 | ||
Random-Play | 0 | 3 | 0 | 0 | 2 | 0 | 1 | 0 | 0 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 3 | 0 | 0 | 2 | 0 | 1 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | ||
Image | Claude-3-Sonnet | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 4 | 1 | 4 | 0 | 0 | 1 | 0 | ||||||
Gemini-1.5-flash | 2 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 4 | 1 | 3 | 0 | 0 | 2 | 0 | |||||||
Gemini-1.5-pro | 1 | 0 | 0 | 3 | 1 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 5 | 0 | 3 | 0 | 0 | 2 | 0 | |||||||
GPT-4-turbo | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 5 | 0 | |||||||
GPT-4o | 0 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 1 | 4 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 1 | 0 | |||||||
Random-Play | 0 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 2 | 0 | 4 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 4 | 0 | 3 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |