Aryan V S (a-r-r-o-w)
wandering on a rock
a-r-r-o-w / pipeline_parallel.py
Created October 2, 2024 10:50 — forked from 3outeille/pipeline_parallel.py
Self-contained example of how pipeline parallelism works (AFAB and 1F1B) in 200 LOC
#VERBOSE=0 torchrun --nproc_per_node 3 self_contained_pp_LOC.py
import os, random, numpy as np, torch, torch.nn as nn, torch.distributed as dist, torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, DistributedSampler
from datasets import load_dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
STEP, local_rank, world_size, verbose = 0, int(os.environ["LOCAL_RANK"]), int(os.environ["WORLD_SIZE"]), os.environ.get("VERBOSE", "0") == "1"
def set_all_seed(seed):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)  # preview truncated here; a standard seeding body is assumed
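The gist's preview stops at the setup above. As a taster for the schedules its description names, here is a minimal, hedged AFAB (all-forward-all-backward) sketch: one toy Linear stage per rank, blocking gloo point-to-point, fixed activation shapes. It is not the gist's 200-LOC implementation, and 1F1B would interleave the forward and backward passes instead of separating the phases.

# Minimal AFAB sketch, NOT the gist's code: one stage per rank, gloo backend.
# run with: torchrun --nproc_per_node 2 <this file>
import torch
import torch.nn as nn
import torch.distributed as dist

dist.init_process_group("gloo")
rank, world = dist.get_rank(), dist.get_world_size()

stage = nn.Linear(8, 8)                      # this rank's slice of the model
opt = torch.optim.AdamW(stage.parameters(), lr=1e-3)

micro_batches, inputs, outputs = 4, [], []
for _ in range(micro_batches):               # phase 1: all forwards
    if rank == 0:
        x = torch.randn(2, 8)
    else:
        x = torch.empty(2, 8)
        dist.recv(x, src=rank - 1)
        x.requires_grad_(True)
    y = stage(x)
    inputs.append(x)
    outputs.append(y)
    if rank < world - 1:
        dist.send(y.detach(), dst=rank + 1)

for x, y in zip(inputs, outputs):            # phase 2: all backwards
    if rank == world - 1:
        y.pow(2).mean().backward()           # toy loss on the last stage
    else:
        grad = torch.empty_like(y)
        dist.recv(grad, src=rank + 1)
        y.backward(grad)
    if rank > 0:
        dist.send(x.grad, dst=rank - 1)

opt.step()
dist.destroy_process_group()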
a-r-r-o-w / ffmpeg_dump.md
Last active January 6, 2025 03:12
ffmpeg dump
  1. Extract the first frame from each video in the current directory and save it as a PNG
for file in *; do ffmpeg -y -i "$file" -frames:v 1 "../images/${file%.*}.png"; done
  2. Horizontally stack multiple videos
ffmpeg -i a.mp4 -i b.mp4 -i c.mp4 -i d.mp4 -filter_complex hstack=inputs=4 output.mp4
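  3. Vertically stack two videos of the same width (an added example in the spirit of the gist, not from it)
ffmpeg -i a.mp4 -i b.mp4 -filter_complex vstack=inputs=2 output.mp4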
a-r-r-o-w / simple_mlp_tp.py
Last active February 15, 2025 21:47
Tensor parallelism (TP) on a simple MLP, applied in four different ways
import copy
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.distributed.tensor.device_mesh import DeviceMesh
from torch.distributed.tensor import Replicate, Shard, DTensor
from torch.distributed.tensor.parallel.style import RowwiseParallel, ColwiseParallel, SequenceParallel
from torch.distributed.tensor.parallel.api import parallelize_module
from torch._utils import _get_device_module
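The preview ends at the imports. One of the four layouts, sketched here with the public torch.distributed.tensor.parallel spellings rather than the private paths the gist imports: colwise on the first linear and rowwise on the second, so the hidden activations stay sharded and a single all-reduce happens after the second matmul. Assumes two CUDA devices under torchrun; illustrative only.

# Hedged TP sketch, not the gist itself.
# run with: torchrun --nproc_per_node 2 <this file>
import os
import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module

torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))  # one GPU per rank
mesh = init_device_mesh("cuda", (2,))
mlp = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 16)).cuda()
# colwise shards the first weight over output features, rowwise shards the
# second over input features; one all-reduce happens after the second linear
mlp = parallelize_module(mlp, mesh, {"0": ColwiseParallel(), "2": RowwiseParallel()})
out = mlp(torch.randn(4, 16, device="cuda"))          # replicated in, replicated out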
a-r-r-o-w / profile_multiple_offload_flux.py
Created February 21, 2025 03:46
Tests multiple offloading mechanisms and gathers their CPU and CUDA memory/time usage for Flux on a single A100 GPU
import argparse
import functools
import json
import os
import pathlib
import psutil
import time
import torch
from diffusers import FluxPipeline
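The preview again stops at the imports. A hedged sketch of what such a comparison can look like with diffusers' two stock offloading hooks; the model id and prompt are placeholders, and the gist itself likely measures more mechanisms with finer CPU-side instrumentation.

# Hedged sketch of the measurement loop; reloads the pipeline per mode so the
# offloading hooks never stack on one object (slow, but unambiguous).
import time
import torch
from diffusers import FluxPipeline

def profile(mode: str):
    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
    if mode == "model":
        pipe.enable_model_cpu_offload()       # offload whole sub-models on demand
    elif mode == "sequential":
        pipe.enable_sequential_cpu_offload()  # offload weights layer by layer
    else:
        pipe.to("cuda")                       # no-offloading baseline
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    pipe("a photo of a cat", num_inference_steps=4)
    print(mode, f"{time.time() - start:.1f}s", f"{torch.cuda.max_memory_allocated() / 1e9:.1f} GB")

for mode in ("none", "model", "sequential"):
    profile(mode)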
import torch
import torch.distributed as dist
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video
from finetrainers._metadata import ParamId, CPInput, CPOutput
from finetrainers.parallel.ptd import apply_context_parallel
from finetrainers.models.attention_dispatch import attention_provider, attention_dispatch
torch.nn.functional.scaled_dot_product_attention = attention_dispatch
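The last line above swaps a custom dispatcher in for PyTorch's native SDPA. The same interception pattern in miniature, with illustrative names that are not the finetrainers API:

import torch
import torch.nn.functional as F

_native_sdpa = F.scaled_dot_product_attention  # keep a handle on the original

def dispatching_sdpa(query, key, value, *args, **kwargs):
    # a real dispatcher would route to flash/efficient/custom kernels here;
    # this toy version just logs the call and falls through to the native op
    print("sdpa call:", tuple(query.shape))
    return _native_sdpa(query, key, value, *args, **kwargs)

F.scaled_dot_product_attention = dispatching_sdpa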
# Reference: https://github.com/arcee-ai/mergekit/blob/488957e8e67c82861ecf63ef761f6bc59122dc74/mergekit/scripts/extract_lora.py
import argparse
import torch
from safetensors.torch import load_file, save_file
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.preferred_linalg_library("cusolver")
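The referenced mergekit script recovers a LoRA from the difference between finetuned and base weights. Its core step, reduced to a hedged sketch (the real script also handles module discovery, embeddings, and saving in PEFT format):

import torch

def extract_lora(w_base: torch.Tensor, w_tuned: torch.Tensor, rank: int):
    # rank-r truncated SVD of the weight delta: w_tuned - w_base ~= B @ A
    delta = (w_tuned - w_base).float()
    u, s, vh = torch.linalg.svd(delta, full_matrices=False)
    lora_B = u[:, :rank] * s[:rank]  # (out_features, rank)
    lora_A = vh[:rank, :]            # (rank, in_features)
    return lora_A, lora_B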
a-r-r-o-w / attention.py
Created May 30, 2025 04:31
Copy-pastable implementations of various attention backends
import contextlib
import functools
import inspect
import os
from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
FINETRAINERS_ATTN_CHECKS = os.environ.get("FINETRAINERS_ATTN_CHECKS", "0").lower() in ("1", "true", "yes")
FINETRAINERS_ATTN_PROVIDER = os.environ.get("FINETRAINERS_ATTN_PROVIDER", "native").lower()
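In the same spirit as those two env flags, a condensed sketch of an env-selected backend switch with just two providers; the gist's copy-pastable version covers many more backends plus the optional shape/dtype checks:

import os
import torch
import torch.nn.functional as F
from enum import Enum

class AttnProvider(str, Enum):
    NATIVE = "native"  # torch SDPA
    MATH = "math"      # explicit softmax(QK^T / sqrt(d)) V reference path

def attention(q, k, v, provider=None):
    provider = AttnProvider(provider or os.environ.get("FINETRAINERS_ATTN_PROVIDER", "native"))
    if provider == AttnProvider.NATIVE:
        return F.scaled_dot_product_attention(q, k, v)
    scores = (q @ k.transpose(-2, -1)) / (q.shape[-1] ** 0.5)
    return scores.softmax(dim=-1) @ v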
import argparse
import contextlib
import math
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
import torch._inductor.config
import torch._higher_order_ops.auto_functionalize as af
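Only the imports of this last snippet survive in the capture; inductor config and auto_functionalize point at a torch.compile experiment, so the following is a deliberately generic, hedged skeleton rather than a guess at the gist's actual benchmark.

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64))
compiled = torch.compile(model)  # inductor is the default backend
with torch.no_grad():
    out = compiled(torch.randn(8, 64))  # first call triggers compilation
print(out.shape)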